diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3dad41a88c8212..db4b1581ae671b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,5 +1,16 @@ # Contributing guidelines +## Pull Request Checklist + +Before sending your pull requests, make sure you have followed this list. + +- Read [contributing guidelines](CONTRIBUTING.md). +- Read [Code of Conduct](CODE_OF_CONDUCT.md). +- Ensure you have signed the [Contributor License Agreement (CLA)](https://cla.developers.google.com/). +- Check that your changes are consistent with the [guidelines](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#general-guidelines-and-philosophy-for-contribution). +- Ensure your changes are consistent with the [Coding Style](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#c-coding-style). +- Run [Unit Tests](https://github.com/tensorflow/tensorflow/blob/master/CONTRIBUTING.md#running-unit-tests). + ## How to become a contributor and submit your own code ### Contributor License Agreements @@ -79,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g., Changes to TensorFlow C++ code should conform to [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html). -Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do: +Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do: ```bash apt-get install -y clang-tidy diff --git a/README.md b/README.md index e1a50c87e26d49..6fb4486d0de9ff 100644 --- a/README.md +++ b/README.md @@ -5,9 +5,9 @@ ----------------- -| **`Documentation`** | **`Linux CPU`** | **`Linux GPU`** | **`Mac OS CPU`** | **`Windows CPU`** | **`Android`** | -|-----------------|---------------------|------------------|-------------------|---------------|---------------| -| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | ![Build Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | [![Build Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) [ ![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg) ](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) +| **`Documentation`** | +|-----------------| +| [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) | **TensorFlow** is an open source software library for numerical computation using data flow graphs. The graph nodes represent mathematical operations, while @@ -40,15 +40,6 @@ environment to install the nightly TensorFlow build. We support CPU and GPU packages on Linux, Mac, and Windows.
-**Individual whl files** -* Linux CPU-only: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=cpu-slave/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=cpu-slave/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=cpu-slave/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=cpu-slave/)) -* Linux GPU: [Python 2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/42/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp27-none-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=gpu-linux/)) / [Python 3.4](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp34-cp34m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=gpu-linux/)) / [Python 3.5](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp35-cp35m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.5,label=gpu-linux/)) / [Python 3.6](http://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly_gpu-1.head-cp36-cp36m-linux_x86_64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-linux/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3.6,label=gpu-linux/)) -* Mac CPU-only: [Python 
2](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py2-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON2,label=mac-slave/)) / [Python 3](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/lastSuccessfulBuild/artifact/pip_test/whl/tf_nightly-1.head-py3-none-any.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-mac/TF_BUILD_IS_OPT=OPT,TF_BUILD_IS_PIP=PIP,TF_BUILD_PYTHON_VERSION=PYTHON3,label=mac-slave/)) -* Windows CPU-only: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows,PY=36/)) -* Windows GPU: [Python 3.5 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp35-cp35m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=35/)) / [Python 3.6 64-bit](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/lastSuccessfulBuild/artifact/cmake_build/tf_python/dist/tf_nightly_gpu-1.head-cp36-cp36m-win_amd64.whl) ([build history](https://ci.tensorflow.org/view/tf-nightly/job/tf-nightly-windows/M=windows-gpu,PY=36/)) -* Android: [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) -([build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/)) - #### *Try your first TensorFlow program* ```shell $ python @@ -82,6 +73,30 @@ The TensorFlow project strives to abide by generally accepted best practices in [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486) + +## Continuous build status + +### Official Builds + +| Build Type | Status | Artifacts | +| --- | --- | --- | +| **Linux CPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Linux GPU** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/ubuntu-gpu-cc.png) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | +| **Linux XLA** | TBA | TBA | +| **MacOS** | ![Status](https://storage.googleapis.com/tensorflow-kokoro-build-badges/macos-py2-cc.png) | [pypi](https://pypi.org/project/tf-nightly/) | +| **Windows CPU** | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-win-cmake-py)](https://ci.tensorflow.org/job/tensorflow-master-win-cmake-py) | 
[pypi](https://pypi.org/project/tf-nightly/) | +| **Windows GPU** | [![Status](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/badge/icon)](http://ci.tensorflow.org/job/tf-master-win-gpu-cmake/) | [pypi](https://pypi.org/project/tf-nightly-gpu/) | +| **Android** | [![Status](https://ci.tensorflow.org/buildStatus/icon?job=tensorflow-master-android)](https://ci.tensorflow.org/job/tensorflow-master-android) | [![Download](https://api.bintray.com/packages/google/tensorflow/tensorflow/images/download.svg)](https://bintray.com/google/tensorflow/tensorflow/_latestVersion) [demo APK](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/tensorflow_demo.apk), [native libs](https://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/native/) [build history](https://ci.tensorflow.org/view/Nightly/job/nightly-android/) | + + +### Community Supported Builds + +| Build Type | Status | Artifacts | +| --- | --- | --- | +| **IBM s390x** | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/) | TBA | +| **IBM ppc64le CPU** | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/) | TBA | + + ## For more information * [TensorFlow Website](https://www.tensorflow.org) diff --git a/RELEASE.md b/RELEASE.md index 2717c75740aeea..27f73b7fc6a524 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -6,7 +6,7 @@ * Added Gradient Boosted Trees as pre-made Estimators: BoostedTreesClassifier, BoostedTreesRegressor. * Add 3rd generation pipeline config for Cloud TPUs which improves performance and usability. * `tf.contrib.bayesflow` is moving out to it's own repo. -* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication. +* Added `tf.contrib.{proto,rpc}` to allow generic proto parsing and RPC communication[1](#rpc-issue). ## Bug Fixes and Other Changes * `tf.data`: @@ -49,13 +49,14 @@ * Fix non-uniformity of orthogonal matrices. * Fix bug where multi-image Estimator eval summaries were not displayed correctly. +1 The cancellation logic of the RPC op contains a concurrency error. A fix has been submitted to master and will be part of the next release. 
+ ## Thanks to our Contributors This release contains contributions from many people at Google, as well as: 4d55397500, Aghasy, Alan Du, Alan Lee, Alan Yee, Alex Wiltschko, Animesh Karnewar, Ankit Gupta, Anton Matosov, Aris L, Ben Barsdell, Brent Yi, Brett Koonce, Carl Thomé, cbockman, Chikanaga Tomoyuki, Chris Tava, CéDric Deltheil, Dahan Gong, Dalmo Cirne, Daniel Erenrich, David Norman, DavidNorman, Edd Wilder-James, Fanjin Zeng, Felix Abecassis, fo40225, George Sterpu, Giovanni Terlingen, Gor Baghdasaryan, Guillaume Klein, Hanchen Li, Ilya Polenov, Jakub Kolodziejczyk, Jason Sadler, Jayaram Bobba, Jerry Liu, jinghuangintel, Jiongyan Zhang (张炯衍), Joel Shor, Jong Wook Kim, Julian Eisenschlos, Karl Lessard, Krish Ravindranath, Loo Rong Jie, Lukas Geiger, Luke Iwanski, Mahmoud Abuzaina, ManHyuk, Marvin Richter, Maximilian Mitchell, Mohammad Ashraf Bhuiyan, msofka, Mustafa Kasap, Nathan Burnham, Nathan Luehr, Naveen Marri, ngc92, nio1814, Oleg Zabluda, Ou Changkun, Panos Ipeirotis, Paul Van Eck, Peter Lee, Piotr Czapla, qjivy, Rholais Lii, Rodrigo Formigone, Russell Klopfer, ryantimjohn, Sang Han, SebastiáN RamíRez, shengfuintel, Siby Jose Plathottam, Silver Chan, Stanislaw Antol, Taehoon Lee, Tarang Chugh, Ted Chang, Thomas Bastiani, Xian Xu, Xiaoming (Jason) Cui, Yan Facai (颜发才), yaox12, Yashal Shakti Kanungo, Yong Tang, Yuan (Terry) Tang, Yuxin Wu, Ziyue(Louis) Lu - # Release 1.7.0 ## Major Features And Improvements @@ -235,7 +236,7 @@ Yoni Tsafir, yordun, Yuan (Terry) Tang, Yuxin Wu, zhengdi, Zhengsheng Wei, 田 * Add `complex64` support to XLA compiler. * `bfloat` support is now added to XLA infrastructure. * Make `ClusterSpec` propagation work with XLA devices. - * Use a determinisitic executor to generate XLA graph. + * Use a deterministic executor to generate XLA graph. * `tf.contrib`: * `tf.contrib.distributions`: * Add `tf.contrib.distributions.Autoregressive`. @@ -403,14 +404,6 @@ answered questions, and were part of inspiring discussions. # Release 1.4.0 -## Major Features And Improvements -* `tf.keras` is now part of the core TensorFlow API. -* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of - the core TensorFlow API. - * The API is now subject to backwards compatibility guarantees. - -# Release 1.4.0 - ## Major Features And Improvements * `tf.keras` is now part of the core TensorFlow API. * [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of diff --git a/SECURITY.md b/SECURITY.md index a5ce3a62ee202f..0a4be37cbc2066 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -168,12 +168,12 @@ below). Please use a descriptive subject line for your report email. After the initial reply to your report, the security team will endeavor to keep you informed of -the progress being made towards a fix and announcement. +the progress being made towards a fix and announcement. In addition, please include the following information along with your report: * Your name and affiliation (if any). -* A description the technical details of the vulnerabilities. It is very +* A description of the technical details of the vulnerabilities. It is very important to let us know how we can reproduce your findings. * An explanation who can exploit this vulnerability, and what they gain when doing so -- write an attack scenario. 
This will help us evaluate your report @@ -246,5 +246,8 @@ v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc= | Type | Versions affected | Reported by | Additional Information | |--------------------|:-----------------:|-----------------------|-----------------------------| +| TensorFlow Lite TOCO FlatBuffer Parsing Vulnerability | <= 1.7 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-003.md) | +| GIF File Parsing Null Pointer Dereference Error | <= 1.5 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-002.md) | +| BMP File Parser Out-of-bounds Read | <= 1.6 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-001.md) | | Out Of Bounds Read | <=1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) | diff --git a/WORKSPACE b/WORKSPACE index 4ddfb9a3832ea1..fd7570a80ae2ee 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -22,26 +22,10 @@ check_bazel_version_at_least("0.10.0") load("//tensorflow:workspace.bzl", "tf_workspace") -# Uncomment and update the paths in these entries to build the Android demo. -#android_sdk_repository( -# name = "androidsdk", -# api_level = 23, -# # Ensure that you have the build_tools_version below installed in the -# # SDK manager as it updates periodically. -# build_tools_version = "26.0.1", -# # Replace with path to Android SDK on your system -# path = "", -#) -# -#android_ndk_repository( -# name="androidndk", -# path="", -# # This needs to be 14 or higher to compile TensorFlow. -# # Please specify API level to >= 21 to build for 64-bit -# # archtectures or the Android NDK will automatically select biggest -# # API level that it supports without notice. -# # Note that the NDK version is not the API level. -# api_level=14) +load("//third_party/android:android_configure.bzl", "android_configure") +android_configure(name="local_config_android") +load("@local_config_android//:android.bzl", "android_workspace") +android_workspace() # Please add all new TensorFlow dependencies in workspace.bzl. 
tf_workspace() diff --git a/build b/build index 25744ada64402a..f8c345d169ee3f 100755 --- a/build +++ b/build @@ -11,4 +11,4 @@ pip uninstall -y tensorflow || true bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures && bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg && -pip install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-cp27-cp27mu-linux_x86_64.whl +pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-cp27-cp27mu-linux_x86_64.whl diff --git a/build_python3 b/build_python3 index a093bdd4873fe6..b0f6f2318a0b9b 100755 --- a/build_python3 +++ b/build_python3 @@ -11,4 +11,4 @@ pip3 uninstall -y tensorflow || true bazel build --config=opt --config=rocm //tensorflow/tools/pip_package:build_pip_package --verbose_failures && bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg && -pip3 install /tmp/tensorflow_pkg/tensorflow-1.8.0rc1-cp35-cp35m-linux_x86_64.whl +pip3 install /tmp/tensorflow_pkg/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl diff --git a/configure.py b/configure.py index 4f6fc8e70bc29b..46c637843bc2dd 100644 --- a/configure.py +++ b/configure.py @@ -498,10 +498,6 @@ def set_cc_opt_flags(environ_cp): if not is_ppc64le() and not is_windows(): write_to_bazelrc('build:opt --host_copt=-march=haswell') write_to_bazelrc('build:opt --define with_default_optimizations=true') - # TODO(mikecase): Remove these default defines once we are able to get - # TF Lite targets building without them. - write_to_bazelrc('build --copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK') - write_to_bazelrc('build --host_copt=-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK') def set_tf_cuda_clang(environ_cp): """set TF_CUDA_CLANG action_env. @@ -674,8 +670,9 @@ def valid_ndk_path(path): error_msg=('The path %s or its child file "source.properties" ' 'does not exist.') ) - - write_android_ndk_workspace_rule(android_ndk_home_path) + write_action_env_to_bazelrc('ANDROID_NDK_HOME', android_ndk_home_path) + write_action_env_to_bazelrc('ANDROID_NDK_API_LEVEL', + check_ndk_level(android_ndk_home_path)) def create_android_sdk_rule(environ_cp): @@ -737,41 +734,12 @@ def valid_build_tools(version): error_msg=('The selected SDK does not have build-tools version %s ' 'available.')) - write_android_sdk_workspace_rule(android_sdk_home_path, - android_build_tools_version, - android_api_level) - - -def write_android_sdk_workspace_rule(android_sdk_home_path, - android_build_tools_version, - android_api_level): - print('Writing android_sdk_workspace rule.\n') - with open(_TF_WORKSPACE, 'a') as f: - f.write(""" -android_sdk_repository( - name="androidsdk", - api_level=%s, - path="%s", - build_tools_version="%s")\n -""" % (android_api_level, android_sdk_home_path, android_build_tools_version)) - - -def write_android_ndk_workspace_rule(android_ndk_home_path): - print('Writing android_ndk_workspace rule.') - ndk_api_level = check_ndk_level(android_ndk_home_path) - if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS: - print('WARNING: The API level of the NDK in %s is %s, which is not ' - 'supported by Bazel (officially supported versions: %s). Please use ' - 'another version. 
Compiling Android targets may result in confusing ' - 'errors.\n' % (android_ndk_home_path, ndk_api_level, - _SUPPORTED_ANDROID_NDK_VERSIONS)) - with open(_TF_WORKSPACE, 'a') as f: - f.write(""" -android_ndk_repository( - name="androidndk", - path="%s", - api_level=%s)\n -""" % (android_ndk_home_path, ndk_api_level)) + write_action_env_to_bazelrc('ANDROID_BUILD_TOOLS_VERSION', + android_build_tools_version) + write_action_env_to_bazelrc('ANDROID_SDK_API_LEVEL', + android_api_level) + write_action_env_to_bazelrc('ANDROID_SDK_HOME', + android_sdk_home_path) def check_ndk_level(android_ndk_home_path): @@ -784,18 +752,16 @@ def check_ndk_level(android_ndk_home_path): revision = re.search(r'Pkg.Revision = (\d+)', filedata) if revision: - return revision.group(1) - return None - - -def workspace_has_any_android_rule(): - """Check the WORKSPACE for existing android_*_repository rules.""" - with open(_TF_WORKSPACE, 'r') as f: - workspace = f.read() - has_any_rule = re.search(r'^android_[ns]dk_repository', - workspace, - re.MULTILINE) - return has_any_rule + ndk_api_level = revision.group(1) + else: + raise Exception('Unable to parse NDK revision.') + if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS: + print('WARNING: The API level of the NDK in %s is %s, which is not ' + 'supported by Bazel (officially supported versions: %s). Please use ' + 'another version. Compiling Android targets may result in confusing ' + 'errors.\n' % (android_ndk_home_path, ndk_api_level, + _SUPPORTED_ANDROID_NDK_VERSIONS)) + return ndk_api_level def set_gcc_host_compiler_path(environ_cp): @@ -845,8 +811,8 @@ def reformat_version_sequence(version_str, sequence_count): def set_tf_cuda_version(environ_cp): """Set CUDA_TOOLKIT_PATH and TF_CUDA_VERSION.""" ask_cuda_version = ( - 'Please specify the CUDA SDK version you want to use, ' - 'e.g. 7.0. [Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION + 'Please specify the CUDA SDK version you want to use. ' + '[Leave empty to default to CUDA %s]: ') % _DEFAULT_CUDA_VERSION for _ in range(_DEFAULT_PROMPT_ASK_ATTEMPTS): # Configure the Cuda SDK version to use. @@ -1226,6 +1192,9 @@ def set_tf_cuda_compute_capabilities(environ_cp): ask_cuda_compute_capabilities, default_cuda_compute_capabilities) # Check whether all capabilities from the input is valid all_valid = True + # Remove all whitespace characters before splitting the string + # that users may insert by accident, as this will result in error + tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split()) for compute_capability in tf_cuda_compute_capabilities.split(','): m = re.match('[0-9]+.[0-9]+', compute_capability) if not m: @@ -1428,6 +1397,10 @@ def set_grpc_build_flags(): write_to_bazelrc('build --define grpc_no_ares=true') +def set_build_strip_flag(): + write_to_bazelrc('build --strip=always') + + def set_windows_build_flags(): if is_windows(): # The non-monolithic build is not supported yet @@ -1558,23 +1531,18 @@ def main(): set_grpc_build_flags() set_cc_opt_flags(environ_cp) + set_build_strip_flag() set_windows_build_flags() - if workspace_has_any_android_rule(): - print('The WORKSPACE file has at least one of ["android_sdk_repository", ' - '"android_ndk_repository"] already set. Will not ask to help ' - 'configure the WORKSPACE. 
Please delete the existing rules to ' - 'activate the helper.\n') - else: - if get_var( - environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace', - False, - ('Would you like to interactively configure ./WORKSPACE for ' - 'Android builds?'), - 'Searching for NDK and SDK installations.', - 'Not configuring the WORKSPACE for Android builds.'): - create_android_ndk_rule(environ_cp) - create_android_sdk_rule(environ_cp) + if get_var( + environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace', + False, + ('Would you like to interactively configure ./WORKSPACE for ' + 'Android builds?'), + 'Searching for NDK and SDK installations.', + 'Not configuring the WORKSPACE for Android builds.'): + create_android_ndk_rule(environ_cp) + create_android_sdk_rule(environ_cp) print('Preconfigured Bazel build configs. You can use any of the below by ' 'adding "--config=<>" to your build command. See tools/bazel.rc for ' diff --git a/tensorflow/BUILD b/tensorflow/BUILD index 79adbe318c4c64..fb5a52e0c9e44c 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -19,6 +19,10 @@ load( "//tensorflow/core:platform/default/build_config.bzl", "tf_additional_binary_deps", ) +load( + "//tensorflow/tools/api/generator:api_gen.bzl", + "gen_api_init_files", # @unused +) # Config setting for determining if we are building for Android. config_setting( @@ -478,7 +482,7 @@ tf_cc_shared_object( # excludes all but a subset of function names. # On MacOS, the linker does not support version_script, but has an # an "-exported_symbols_list" command. -z defs disallows undefined -# symbols in object files and -s strips the output. +# symbols in object files. tf_cc_shared_object( name = "libtensorflow.so", @@ -492,7 +496,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow/c:version_script.lds)", ], @@ -518,7 +521,6 @@ tf_cc_shared_object( "//tensorflow:windows_msvc": [], "//conditions:default": [ "-z defs", - "-s", "-Wl,--version-script", # This line must be directly followed by the version_script.lds file "$(location //tensorflow:tf_version_script.lds)", ], @@ -543,13 +545,16 @@ exports_files( ], ) +gen_api_init_files( + name = "python_api_gen", + srcs = ["api_template.__init__.py"], + root_init_template = "api_template.__init__.py", +) + py_library( name = "tensorflow_py", - srcs = ["__init__.py"], + srcs = [":python_api_gen"], srcs_version = "PY2AND3", visibility = ["//visibility:public"], - deps = [ - "//tensorflow/python", - "//tensorflow/tools/api/generator:python_api", - ], + deps = ["//tensorflow/python"], ) diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py index c8683e3976c90a..440e9f8dbd2f4b 100644 --- a/tensorflow/__init__.py +++ b/tensorflow/__init__.py @@ -22,9 +22,6 @@ # pylint: disable=g-bad-import-order from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import -# pylint: disable=wildcard-import -from tensorflow.tools.api.generator.api import * # pylint: disable=redefined-builtin -# pylint: enable=wildcard-import from tensorflow.python.util.lazy_loader import LazyLoader contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib') diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py new file mode 100644 index 00000000000000..9b0d7d48afd058 --- /dev/null +++ b/tensorflow/api_template.__init__.py @@ -0,0 +1,43 @@ +# Copyright 2015 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Bring in all of the public TensorFlow interface into this module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=g-bad-import-order +from tensorflow.python import pywrap_tensorflow # pylint: disable=unused-import +# API IMPORTS PLACEHOLDER + +from tensorflow.python.util.lazy_loader import LazyLoader +contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib') +del LazyLoader + +from tensorflow.python.platform import flags # pylint: disable=g-import-not-at-top +app.flags = flags # pylint: disable=undefined-variable + +del absolute_import +del division +del print_function + +# These symbols appear because we import the python package which +# in turn imports from tensorflow.core and tensorflow.python. They +# must come from this module. So python adds these symbols for the +# resolution to succeed. +# pylint: disable=undefined-variable +del python +del core +# pylint: enable=undefined-variable diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc index 18eeb2816807ec..b86b277ac3200b 100644 --- a/tensorflow/c/c_api.cc +++ b/tensorflow/c/c_api.cc @@ -2097,7 +2097,7 @@ static void GraphImportGraphDefLocked(TF_Graph* graph, const GraphDef& def, for (int i = 0; i < size; ++i) { TensorId id = results.missing_unused_input_map_keys[i]; - tf_results->missing_unused_key_names_data.push_back(id.first.ToString()); + tf_results->missing_unused_key_names_data.push_back(std::string(id.first)); tf_results->missing_unused_key_names[i] = tf_results->missing_unused_key_names_data.back().c_str(); tf_results->missing_unused_key_indexes[i] = id.second; diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc index d3916bc16778a9..95b04f9058afdf 100644 --- a/tensorflow/c/c_api_experimental.cc +++ b/tensorflow/c/c_api_experimental.cc @@ -8368,3 +8368,90 @@ TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets( return getnext_node; #endif } + +TF_Tensor* TF_DequeueNamedTensor(TF_Session* session, int tensor_id, + TF_Status* status) { + assert(session); + { + tensorflow::mutex_lock c(session->graph->mu); + VLOG(1) << "Dequeuing named tensor with id " << tensor_id + << ", with input graph: " + << session->graph->graph.ToGraphDefDebug().DebugString(); + } + + TF_Operation* dequeue_op = TF_GraphOperationByName( + session->graph, + tensorflow::strings::StrCat("fifo_queue_dequeue_", tensor_id).c_str()); + if (dequeue_op == nullptr) { + status->status = tensorflow::errors::Internal( + "Unable to find the dequeue node in the TF graph."); + return nullptr; + } + + VLOG(1) << "Running the dequeue op"; + TF_Output output{dequeue_op, 0}; + TF_Tensor* ret; + TF_SessionRun(session, /*run_options*/ nullptr, + // input related parameters + /*inputs*/ nullptr, /*input_values*/ nullptr, /*ninputs*/ 0, + // output related parameters + /*outputs*/ &output, /*output_values*/ 
&ret, + /*noutputs*/ 1, + /*targets*/ nullptr, /*ntargets*/ 0, + /*run_metadata*/ nullptr, status); + if (VLOG_IS_ON(1) && status->status.ok()) { + tensorflow::Tensor tensor; + if (tensorflow::TF_TensorToTensor(ret, &tensor).ok()) { + VLOG(1) << "Dequeued tensor content: " << tensor.DebugString(); + } + } + return ret; +} + +void TF_EnqueueNamedTensor(TF_Session* session, int tensor_id, + TF_Tensor* tensor, TF_Status* status) { + assert(session); + { + tensorflow::mutex_lock c(session->graph->mu); + if (VLOG_IS_ON(1)) { + VLOG(1) << "Enqueuing named tensor with id " << tensor_id + << ", with input graph: " + << session->graph->graph.ToGraphDefDebug().DebugString(); + tensorflow::Tensor internal_tensor; + if (tensorflow::TF_TensorToTensor(tensor, &internal_tensor).ok()) { + VLOG(1) << "Enqueu'ing tensor content: " + << internal_tensor.DebugString(); + } + } + } + + TF_Operation* enqueue_op = TF_GraphOperationByName( + session->graph, + tensorflow::strings::StrCat("fifo_queue_enqueue_", tensor_id).c_str()); + if (enqueue_op == nullptr) { + status->status = tensorflow::errors::Internal( + "Unable to find the enqueue node in the TF graph."); + return; + } + + TF_Operation* placeholder_op = TF_GraphOperationByName( + session->graph, + tensorflow::strings::StrCat("arg_tensor_enqueue_", tensor_id).c_str()); + if (placeholder_op == nullptr) { + status->status = tensorflow::errors::Internal( + "Unable to find the placeholder node as input to enqueue in the TF " + "graph."); + return; + } + + VLOG(1) << "Running the enqueue op"; + TF_Output input{placeholder_op, 0}; + TF_SessionRun(session, /*run_options*/ nullptr, + // input related parameters + /*inputs*/ &input, /*input_values*/ &tensor, /*ninputs*/ 1, + // output related parameters + /*outputs*/ nullptr, /*output_values*/ nullptr, /*noutputs*/ 0, + /*targets*/ &enqueue_op, /*ntargets*/ 1, + /*run_metadata*/ nullptr, status); + VLOG(1) << "Enqueuing is done."; +} diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h index 88cb173cd25f42..20bdace40f1272 100644 --- a/tensorflow/c/c_api_experimental.h +++ b/tensorflow/c/c_api_experimental.h @@ -86,6 +86,35 @@ TF_CAPI_EXPORT extern TF_Operation* TF_MakeFileBasedIteratorGetNextWithDatasets( TF_Graph* graph, const char* file_path, int batch_size, unsigned char is_mnist, TF_Status* status); +// On success, dequeues a tensor from a TF-managed FifoQueue given by +// `tensor_id`, associated with `session`. There must be a graph node named +// "fifo_queue_dequeue_", to be executed by this API call. + +// Caller must call TF_DeleteTensor() over the returned tensor. If the queue is +// empty, this call is blocked. +// +// Tensors are enqueued via the corresponding TF enqueue op. +// TODO(hongm): Add support for `timeout_ms`. +TF_CAPI_EXPORT extern TF_Tensor* TF_DequeueNamedTensor(TF_Session* session, + int tensor_id, + TF_Status* status); + +// On success, enqueues `tensor` into a TF-managed FifoQueue given by +// `tensor_id`, associated with `session`. There must be a graph node named +// "fifo_queue_enqueue_", to be executed by this API call. It reads +// from a placeholder node "arg_tensor_enqueue_". +// +// `tensor` is still owned by the caller. This call will be blocked if the queue +// has reached its capacity, and will be unblocked when the queued tensors again +// drop below the capacity due to dequeuing. +// +// Tensors are dequeued via the corresponding TF dequeue op. +// TODO(hongm): Add support for `timeout_ms`. 
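For context, a minimal usage sketch (not part of this patch) of the two queue calls documented above. It assumes a `TF_Session*` created elsewhere whose graph already contains the required `fifo_queue_enqueue_0`, `fifo_queue_dequeue_0`, and `arg_tensor_enqueue_0` nodes for `tensor_id` 0; the helper name is hypothetical and error handling is abbreviated:

```cpp
#include "tensorflow/c/c_api.h"
#include "tensorflow/c/c_api_experimental.h"

// Sketch only: `session` and `value` are created by the caller; the graph must
// already define the enqueue/dequeue/placeholder nodes for tensor_id 0.
void RoundTripNamedTensor(TF_Session* session, TF_Tensor* value) {
  TF_Status* status = TF_NewStatus();

  // Blocks if the queue is at capacity; `value` remains owned by the caller.
  TF_EnqueueNamedTensor(session, /*tensor_id=*/0, value, status);

  // Blocks until a tensor is available; the caller must delete the result.
  TF_Tensor* out = TF_DequeueNamedTensor(session, /*tensor_id=*/0, status);
  if (TF_GetCode(status) == TF_OK && out != nullptr) {
    TF_DeleteTensor(out);
  }
  TF_DeleteStatus(status);
}
```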
+TF_CAPI_EXPORT extern void TF_EnqueueNamedTensor(TF_Session* session, + int tensor_id, + TF_Tensor* tensor, + TF_Status* status); + #ifdef __cplusplus } /* end extern "C" */ #endif diff --git a/tensorflow/c/c_api_test.cc b/tensorflow/c/c_api_test.cc index 2762f31e0ccebf..581e5bd1998c73 100644 --- a/tensorflow/c/c_api_test.cc +++ b/tensorflow/c/c_api_test.cc @@ -1368,7 +1368,7 @@ TEST(CAPI, SavedModel) { } const tensorflow::string input_op_name = - tensorflow::ParseTensorName(input_name).first.ToString(); + std::string(tensorflow::ParseTensorName(input_name).first); TF_Operation* input_op = TF_GraphOperationByName(graph, input_op_name.c_str()); ASSERT_TRUE(input_op != nullptr); @@ -1376,7 +1376,7 @@ TEST(CAPI, SavedModel) { ASSERT_EQ(TF_OK, TF_GetCode(s)) << TF_Message(s); const tensorflow::string output_op_name = - tensorflow::ParseTensorName(output_name).first.ToString(); + std::string(tensorflow::ParseTensorName(output_name).first); TF_Operation* output_op = TF_GraphOperationByName(graph, output_op_name.c_str()); ASSERT_TRUE(output_op != nullptr); diff --git a/tensorflow/c/c_test_util.h b/tensorflow/c/c_test_util.h index cd19cf8d624d9b..c16aba666ee697 100644 --- a/tensorflow/c/c_test_util.h +++ b/tensorflow/c/c_test_util.h @@ -20,6 +20,7 @@ limitations under the License. #include #include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/types.pb.h" diff --git a/tensorflow/c/checkpoint_reader.cc b/tensorflow/c/checkpoint_reader.cc index b1f7bdaa5420a5..74bc25a491ac01 100644 --- a/tensorflow/c/checkpoint_reader.cc +++ b/tensorflow/c/checkpoint_reader.cc @@ -125,7 +125,7 @@ CheckpointReader::BuildV2VarMaps() { const auto& slice_proto = entry.slices(i); CHECK(filtered_keys .insert(EncodeTensorNameSlice( - v2_reader_->key().ToString() /* full var's name */, + std::string(v2_reader_->key()) /* full var's name */, TensorSlice(slice_proto))) .second); } @@ -138,11 +138,11 @@ CheckpointReader::BuildV2VarMaps() { new TensorSliceReader::VarToDataTypeMap); v2_reader_->Seek(kHeaderEntryKey); for (v2_reader_->Next(); v2_reader_->Valid(); v2_reader_->Next()) { - if (filtered_keys.count(v2_reader_->key().ToString()) > 0) continue; + if (filtered_keys.count(std::string(v2_reader_->key())) > 0) continue; CHECK(entry.ParseFromArray(v2_reader_->value().data(), v2_reader_->value().size())) << entry.InitializationErrorString(); - string key = v2_reader_->key().ToString(); + string key = std::string(v2_reader_->key()); (*var_to_shape_map)[key] = TensorShape(entry.shape()); (*var_to_data_type_map)[key] = DataType(entry.dtype()); } diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index d51f0520ac39d5..f2af81f04c5e1f 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -14,6 +14,7 @@ tf_gpu_library( name = "c_api", srcs = [ "c_api.cc", + "c_api_debug.cc", "c_api_internal.h", ], hdrs = ["c_api.h"], @@ -24,10 +25,10 @@ tf_gpu_library( "//tensorflow/core:android_tensorflow_lib_lite", ], "//conditions:default": [ - ":runtime", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/core:core_cpu", + "//tensorflow/core/common_runtime/eager:attr_builder", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", "//tensorflow/core/common_runtime/eager:execute", @@ -45,10 +46,22 @@ tf_gpu_library( "//tensorflow:with_xla_support": [ 
"//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/jit", + "//tensorflow/compiler/jit:xla_device", ], "//conditions:default": [], }) + [ "//tensorflow/core/common_runtime/eager:eager_operation", + "//tensorflow/core/distributed_runtime/eager:eager_client", + "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", + "//tensorflow/core/distributed_runtime/rpc:grpc_channel", + "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", + "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", + "//tensorflow/core/distributed_runtime:remote_device", + "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:worker_env", "//tensorflow/core:gpu_runtime", ], ) @@ -59,7 +72,6 @@ tf_gpu_library( visibility = ["//tensorflow:internal"], deps = [ ":c_api", - ":runtime", "//tensorflow/c:c_api", "//tensorflow/c:c_api_internal", "//tensorflow/core:core_cpu", @@ -69,70 +81,65 @@ tf_gpu_library( "//tensorflow/core:framework_lite", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", + "//tensorflow/core/common_runtime/eager:attr_builder", "//tensorflow/core/common_runtime/eager:context", "//tensorflow/core/common_runtime/eager:eager_executor", "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:kernel_and_device", "//tensorflow/core/common_runtime/eager:tensor_handle", + "//tensorflow/core/distributed_runtime:remote_device", + "//tensorflow/core/distributed_runtime:server_lib", + "//tensorflow/core/distributed_runtime:worker_env", + "//tensorflow/core/distributed_runtime/eager:eager_client", + "//tensorflow/core/distributed_runtime/eager:remote_tensor_handle", + "//tensorflow/core/distributed_runtime/rpc:grpc_channel", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache", + "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service", + "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr", + "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib", + "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", ], ) -tf_gpu_cc_test( - name = "c_api_test", - srcs = ["c_api_test.cc"], - extra_copts = tfe_xla_copts(), - tags = [ - "guitar", - "multi_gpu", +tf_gpu_library( + name = "c_api_test_util", + testonly = 1, + srcs = ["c_api_test_util.cc"], + hdrs = ["c_api_test_util.h"], + visibility = [ + "//learning/brain:__subpackages__", + "//tensorflow:__subpackages__", ], deps = [ ":c_api", "//tensorflow/c:c_test_util", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", - "//tensorflow/core:test_main", ], ) -tf_gpu_library( - name = "runtime", - srcs = ["runtime.cc"], - hdrs = ["runtime.h"], - copts = tf_copts(), - visibility = ["//tensorflow:internal"], - deps = select({ - "//tensorflow:android": [ - "//tensorflow/core:android_tensorflow_lib_lite", - ], - "//conditions:default": [ - "//tensorflow/c:c_api", - "//tensorflow/core:core_cpu", - "//tensorflow/core/common_runtime/eager:kernel_and_device", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", - "//tensorflow/core:framework_internal", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - "//tensorflow/core:protos_all_cc", - ], - }), -) - -tf_cc_test( - name = "runtime_test", - srcs = 
["runtime_test.cc"], +tf_gpu_cc_test( + name = "c_api_test", + srcs = [ + "c_api_debug_test.cc", + "c_api_test.cc", + ], + extra_copts = tfe_xla_copts(), + tags = [ + "guitar", + "multi_gpu", + ], deps = [ - ":runtime", - "//tensorflow/cc:cc_ops", - "//tensorflow/cc:client_session", - "//tensorflow/cc:ops", - "//tensorflow/cc:scope", - "//tensorflow/core:core_cpu_internal", - "//tensorflow/core:framework", + ":c_api", + ":c_api_test_util", + "//tensorflow/c:c_test_util", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", "//tensorflow/core:test", "//tensorflow/core:test_main", + "//tensorflow/core/distributed_runtime/rpc/eager:eager_grpc_server_lib", ], ) diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 3bf071f3abaac7..81221c4078bec9 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" #include "tensorflow/c/eager/c_api_internal.h" -#include "tensorflow/c/eager/runtime.h" #ifdef TENSORFLOW_EAGER_USE_XLA #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #endif // TENSORFLOW_EAGER_USE_XLA @@ -32,15 +31,22 @@ limitations under the License. #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/common_runtime/device_set.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/eager/copy_to_device_node.h" #include "tensorflow/core/common_runtime/eager/execute.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h" +#include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/framework/node_def_util.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/lib/core/refcount.h" +#include "tensorflow/core/lib/gtl/cleanup.h" #include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/gtl/stl_util.h" @@ -67,10 +73,121 @@ string DeviceName(const tensorflow::Device* d) { return (d == nullptr) ? "cpu:0" : d->name(); } -#ifdef TENSORFLOW_EAGER_USE_XLA -std::atomic_int_fast64_t func_id_generator(0); -#endif // TENSORFLOW_EAGER_USE_XLA +tensorflow::Status GetAllRemoteDevices( + const std::vector& remote_workers, + tensorflow::WorkerCacheInterface* worker_cache, + std::unique_ptr* device_mgr) { + std::vector remote_devices; + tensorflow::Status status; + // TODO(nareshmodi) do this in parallel instead of serially. 
+ for (const string& remote_worker : remote_workers) { + tensorflow::Notification n; + tensorflow::NewRemoteDevices( + tensorflow::Env::Default(), worker_cache, remote_worker, + [&status, &n, &remote_devices]( + const tensorflow::Status& s, + std::vector* devices) { + status = s; + if (s.ok()) { + for (tensorflow::Device* d : *devices) { + remote_devices.push_back(d); + } + } + n.Notify(); + }); + n.WaitForNotification(); + } + std::unique_ptr remote_device_mgr( + new tensorflow::DeviceMgr(remote_devices)); + + TF_RETURN_IF_ERROR(status); + + *device_mgr = std::move(remote_device_mgr); + return tensorflow::Status::OK(); +} + +tensorflow::Status CreateRemoteContexts( + const std::vector& remote_workers, + tensorflow::eager::EagerClientCache* remote_eager_workers, bool async, + tensorflow::gtl::FlatMap* remote_contexts) { + for (int i = 0; i < remote_workers.size(); i++) { + const string& remote_worker = remote_workers[i]; + + tensorflow::eager::CreateContextRequest request; + tensorflow::eager::CreateContextResponse response; + tensorflow::DeviceNameUtils::ParsedName parsed_name; + if (!tensorflow::DeviceNameUtils::ParseFullName(remote_worker, + &parsed_name)) { + return tensorflow::errors::InvalidArgument( + "Unable to parse ", remote_worker, " as a device name"); + } + request.mutable_server_def()->set_job_name(parsed_name.job); + request.mutable_server_def()->set_task_index(parsed_name.task); + request.set_async(async); + auto* eager_client = remote_eager_workers->GetClient(remote_worker); + if (eager_client == nullptr) { + return tensorflow::errors::Internal( + "Cannot find a client for the given target:", remote_worker); + } + tensorflow::Notification n; + tensorflow::Status status; + // TODO(nareshmodi) do this in parallel instead of serially. + eager_client->CreateContextAsync( + &request, &response, [&status, &n](const tensorflow::Status& s) { + status = s; + n.Notify(); + }); + n.WaitForNotification(); + TF_RETURN_IF_ERROR(status); + + remote_contexts->emplace(remote_worker, response.context_id()); + } + return tensorflow::Status::OK(); +} + +tensorflow::Status NewRemoteAwareTFE_Context(const TFE_ContextOptions* opts, + TFE_Context** ctx) { + string worker_name = tensorflow::strings::StrCat( + "/job:", opts->server_def.job_name(), + "/replica:0/task:", opts->server_def.task_index()); + std::unique_ptr server; + TF_RETURN_IF_ERROR( + tensorflow::eager::EagerGrpcServer::Create(opts->server_def, &server)); + + TF_RETURN_IF_ERROR(server->Start()); + + std::vector remote_workers; + server->master_env()->worker_cache->ListWorkers(&remote_workers); + remote_workers.erase( + std::remove(remote_workers.begin(), remote_workers.end(), worker_name), + remote_workers.end()); + + std::unique_ptr remote_device_mgr; + TF_RETURN_IF_ERROR(GetAllRemoteDevices( + remote_workers, server->master_env()->worker_cache, &remote_device_mgr)); + + std::shared_ptr channel_cache = + server->channel_cache(); + std::unique_ptr remote_eager_workers( + tensorflow::eager::NewGrpcEagerClientCache(channel_cache)); + // Initialize remote eager workers. 
+ tensorflow::gtl::FlatMap remote_contexts; + TF_RETURN_IF_ERROR(CreateRemoteContexts(remote_workers, + remote_eager_workers.get(), + opts->async, &remote_contexts)); + + tensorflow::RemoteRendezvous* r = + server->worker_env()->rendezvous_mgr->Find(0); + + auto* device_mgr = server->worker_env()->device_mgr; + *ctx = new TFE_Context(opts->session_options.options, opts->policy, + opts->async, device_mgr, r, std::move(server), + std::move(remote_eager_workers), + std::move(remote_device_mgr), remote_contexts); + + return tensorflow::Status::OK(); +} } // namespace extern "C" { @@ -91,6 +208,15 @@ void TFE_ContextOptionsSetDevicePlacementPolicy( options->policy = policy; } +TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef( + TFE_ContextOptions* options, const void* proto, size_t proto_len, + TF_Status* status) { + if (!options->server_def.ParseFromArray(proto, proto_len)) { + status->status = tensorflow::errors::InvalidArgument( + "Invalid tensorflow.ServerDef protocol buffer"); + } +} + TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, unsigned char async, TF_Status* status) { @@ -100,17 +226,23 @@ TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx, void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { + if (!opts->server_def.job_name().empty()) { + TFE_Context* ctx = nullptr; + status->status = NewRemoteAwareTFE_Context(opts, &ctx); + return ctx; + } + std::vector devices; status->status = tensorflow::DeviceFactory::AddDevices( opts->session_options.options, "/job:localhost/replica:0/task:0", &devices); - if (!status->status.ok()) { - return nullptr; - } + if (!status->status.ok()) return nullptr; std::unique_ptr device_mgr( new tensorflow::DeviceMgr(devices)); + tensorflow::Rendezvous* r = new tensorflow::IntraProcessRendezvous(device_mgr.get()); + return new TFE_Context(opts->session_options.options, opts->policy, opts->async, std::move(device_mgr), r); } @@ -119,7 +251,10 @@ void TFE_DeleteContext(TFE_Context* ctx, TF_Status* status) { delete ctx; } TF_DeviceList* TFE_ContextListDevices(TFE_Context* ctx, TF_Status* status) { TF_DeviceList* list = new TF_DeviceList; - ctx->context.device_mgr()->ListDeviceAttributes(&list->response); + ctx->context.local_device_mgr()->ListDeviceAttributes(&list->response); + if (ctx->context.remote_device_mgr()) { + ctx->context.remote_device_mgr()->ListDeviceAttributes(&list->response); + } return list; } diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h index c06ce84a8c578a..1862af3ce2f505 100644 --- a/tensorflow/c/eager/c_api.h +++ b/tensorflow/c/eager/c_api.h @@ -81,6 +81,16 @@ TF_CAPI_EXPORT extern void TFE_ContextOptionsSetAsync(TFE_ContextOptions*, TF_CAPI_EXPORT extern void TFE_ContextOptionsSetDevicePlacementPolicy( TFE_ContextOptions*, TFE_ContextDevicePlacementPolicy); +// A tensorflow.ServerDef specifies remote workers (in addition to the current +// workers name). Operations created on this context can then be executed on +// any of these remote workers by setting an appropriate device. +// +// If the following is set, all servers identified by the +// ServerDef must be up when the context is created. +TF_CAPI_EXPORT extern void TFE_ContextOptionsSetServerDef( + TFE_ContextOptions* options, const void* proto, size_t proto_len, + TF_Status* status); + // Destroy an options object. 
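As a rough usage sketch (not part of this patch), a remote-aware eager context can be requested by serializing a `tensorflow::ServerDef` and passing it through `TFE_ContextOptionsSetServerDef` before `TFE_NewContext`. The helper name, job name, and worker addresses below are hypothetical, and every server listed in the ServerDef must already be running when the context is created:

```cpp
#include <string>

#include "tensorflow/c/eager/c_api.h"
#include "tensorflow/core/protobuf/cluster.pb.h"
#include "tensorflow/core/protobuf/tensorflow_server.pb.h"

// Sketch only: builds a two-task "worker" job; task 0 is this process.
// Check TF_GetCode(status) after each call in real code.
TFE_Context* NewRemoteContext(TF_Status* status) {
  tensorflow::ServerDef server_def;
  server_def.set_protocol("grpc");
  server_def.set_job_name("worker");
  server_def.set_task_index(0);
  auto* job = server_def.mutable_cluster()->add_job();
  job->set_name("worker");
  (*job->mutable_tasks())[0] = "localhost:2222";
  (*job->mutable_tasks())[1] = "localhost:2223";

  const std::string proto = server_def.SerializeAsString();
  TFE_ContextOptions* opts = TFE_NewContextOptions();
  TFE_ContextOptionsSetServerDef(opts, proto.data(), proto.size(), status);
  TFE_Context* ctx = TFE_NewContext(opts, status);  // returns nullptr on error
  TFE_DeleteContextOptions(opts);
  return ctx;
}
```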
TF_CAPI_EXPORT extern void TFE_DeleteContextOptions(TFE_ContextOptions*); @@ -181,6 +191,45 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopyToDevice( TFE_TensorHandle* h, TFE_Context* ctx, const char* device_name, TF_Status* status); +// Debugging/Profiling information for TFE_TensorHandle +// +// TFE_TensorDebugInfo contains information useful for debugging and +// profiling tensors. +typedef struct TFE_TensorDebugInfo TFE_TensorDebugInfo; + +// Retrieves TFE_TensorDebugInfo for `handle`. +// If TFE_TensorHandleTensorDebugInfo succeeds, `status` is set to OK and caller +// is responsible for deleting returned TFE_TensorDebugInfo. +// If TFE_TensorHandleTensorDebugInfo fails, `status` is set to appropriate +// error and nullptr is returned. This function can block till the operation +// that produces `handle` has completed. +TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( + TFE_TensorHandle* handle, TF_Status* status); + +// Deletes `debug_info`. +TF_CAPI_EXPORT extern void TFE_DeleteTensorDebugInfo( + TFE_TensorDebugInfo* debug_info); + +// Returns the number of dimensions used to represent the tensor on its device. +// The number of dimensions used to reprensent the tensor on device can be +// different from the number returned by TFE_TensorHandleNumDims. +// The return value was current at the time of TFE_TensorDebugInfo creation. +TF_CAPI_EXPORT extern int TFE_TensorDebugInfoOnDeviceNumDims( + TFE_TensorDebugInfo* debug_info); + +// Returns the number of elements in dimension `dim_index`. +// Tensor representation on device can be transposed from its representation +// on host. The data contained in dimension `dim_index` on device +// can correspond to the data contained in another dimension in on-host +// representation. The dimensions are indexed using the standard TensorFlow +// major-to-minor order (slowest varying dimension first), +// not the XLA's minor-to-major order. +// On-device dimensions can be padded. TFE_TensorDebugInfoOnDeviceDim returns +// the number of elements in a dimension after padding. +// The return value was current at the time of TFE_TensorDebugInfo creation. +TF_CAPI_EXPORT extern int64_t TFE_TensorDebugInfoOnDeviceDim( + TFE_TensorDebugInfo* debug_info, int dim_index); + // Description of the TensorFlow op to execute. // // Assumes that the provided 'ctx' outlives the returned TFE_Op, i.e., diff --git a/tensorflow/c/eager/c_api_debug.cc b/tensorflow/c/eager/c_api_debug.cc new file mode 100644 index 00000000000000..5006b76f1981d0 --- /dev/null +++ b/tensorflow/c/eager/c_api_debug.cc @@ -0,0 +1,167 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/eager/c_api.h" + +#include + +#include "tensorflow/c/c_api.h" +#include "tensorflow/c/eager/c_api_internal.h" +#ifdef TENSORFLOW_EAGER_USE_XLA +#include "tensorflow/compiler/jit/xla_device.h" +#endif // TENSORFLOW_EAGER_USE_XLA + +using tensorflow::int64; +using tensorflow::string; + +namespace { + +std::vector TensorShapeAsVector(TFE_TensorHandle* handle, + TF_Status* status) { + std::vector shape; + int rank = TFE_TensorHandleNumDims(handle, status); + if (!status->status.ok()) { + return shape; + } + shape.reserve(rank); + for (int i = 0; i < rank; ++i) { + shape.push_back(TFE_TensorHandleDim(handle, i, status)); + if (!status->status.ok()) { + return shape; + } + } + return shape; +} + +} // namespace + +extern "C" { + +TF_CAPI_EXPORT extern TFE_TensorDebugInfo* TFE_TensorHandleTensorDebugInfo( + TFE_TensorHandle* handle, TF_Status* status) { + const tensorflow::Tensor* tensor; + status->status = handle->handle->Tensor(&tensor); + if (!status->status.ok()) { + return nullptr; + } + + tensorflow::Device* device; + status->status = handle->handle->Device(&device); + if (!status->status.ok()) { + return nullptr; + } + +#ifdef TENSORFLOW_EAGER_USE_XLA + // If tensor resides on an XLA device, use XLA device's PaddedShapeFn. + tensorflow::XlaDevice* xla_device = + dynamic_cast(device); + if (xla_device != nullptr) { + tensorflow::XlaDevice::PaddedShapeFn shape_fn = + xla_device->metadata().padded_shape_fn(); + xla::Shape padded_shape; + status->status = shape_fn(*tensor, &padded_shape); + if (!status->status.ok()) { + return nullptr; + } + if (VLOG_IS_ON(3)) { + std::vector shape_to_log = TensorShapeAsVector(handle, status); + if (!status->status.ok()) { + // Ignore the status here as we are simply logging. + status->status = tensorflow::Status::OK(); + } else { + VLOG(3) << "Fully padded shape of [" + << tensorflow::str_util::Join(shape_to_log, ", ") << "] is " + << padded_shape.DebugString(); + } + } + + if (xla::ShapeUtil::IsTuple(padded_shape)) { + if (xla::ShapeUtil::TupleElementCount(padded_shape) != 2) { + // Currently, the only case of XlaTensor containing a tuple shape is to + // represent 64 bit ints, doubles, and complex numbers (we don't support + // 64bit complex numbers). + status->status = tensorflow::errors::InvalidArgument( + "XlaTensors should only contain tuples of size 2. Shape: ", + padded_shape.DebugString()); + return nullptr; + } + + // shape0 is not a const& because we will assign it to padded_shape below. + // It is illegal to assign a part of a message to itself. + xla::Shape shape0 = xla::ShapeUtil::GetTupleElementShape(padded_shape, 0); + const xla::Shape& shape1 = + xla::ShapeUtil::GetTupleElementShape(padded_shape, 1); + if (xla::ShapeUtil::IsTuple(shape0) || xla::ShapeUtil::IsTuple(shape1)) { + status->status = tensorflow::errors::InvalidArgument( + "XlaTensors should not contain nested tuples. Shape: ", + padded_shape.DebugString()); + return nullptr; + } + if (!xla::ShapeUtil::Equal(shape0, shape1)) { + status->status = tensorflow::errors::InvalidArgument( + "Subshapes of XlaTensors should be the same. Shape: ", + padded_shape.DebugString()); + return nullptr; + } + + // Since the only case we handle here are two equal subshapes, we + // simply return one of them. The caller will interpret it as this + // shape directly storing the 64bit types. This approximation is good + // enough for this API's debugging use case. 
+ padded_shape = shape0; + } + + int rank = padded_shape.dimensions_size(); + std::vector dev_dims; + dev_dims.reserve(rank); + if (rank == 1) { + // Rank 1 tensors might not have padded_shape.layout.minor_to_major set, + dev_dims.push_back(padded_shape.dimensions(0)); + } else { + for (int i = rank - 1; i >= 0; --i) { + int64 dim_index = padded_shape.layout().minor_to_major(i); + dev_dims.push_back(padded_shape.dimensions(dim_index)); + } + } + status->status = tensorflow::Status::OK(); + return new TFE_TensorDebugInfo(dev_dims); + } +#endif // TENSORFLOW_EAGER_USE_XLA + + // If the tensor is not an XLA tensor, the device shape is + // the same as regular tensor shape. + std::vector dev_dims = TensorShapeAsVector(handle, status); + if (!status->status.ok()) { + return nullptr; + } + return new TFE_TensorDebugInfo(dev_dims); +} + +TF_CAPI_EXPORT extern void TFE_DeleteTensorDebugInfo( + TFE_TensorDebugInfo* debug_info) { + delete debug_info; +} + +TF_CAPI_EXPORT extern int TFE_TensorDebugInfoOnDeviceNumDims( + TFE_TensorDebugInfo* debug_info) { + return debug_info->dev_dims.size(); +} + +TF_CAPI_EXPORT extern int64_t TFE_TensorDebugInfoOnDeviceDim( + TFE_TensorDebugInfo* debug_info, int dim_index) { + return debug_info->dev_dims[dim_index]; +} + +} // extern "C" diff --git a/tensorflow/c/eager/c_api_debug_test.cc b/tensorflow/c/eager/c_api_debug_test.cc new file mode 100644 index 00000000000000..cddb9f6e00e9d6 --- /dev/null +++ b/tensorflow/c/eager/c_api_debug_test.cc @@ -0,0 +1,50 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/c/eager/c_api.h" + +#include +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +TEST(CApiDebug, ScalarCPU) { + TFE_TensorHandle* h = TestScalarTensorHandle(); + TF_Status* status = TF_NewStatus(); + TFE_TensorDebugInfo* debug_info = TFE_TensorHandleTensorDebugInfo(h, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + ASSERT_EQ(0, TFE_TensorDebugInfoOnDeviceNumDims(debug_info)); + + TFE_DeleteTensorDebugInfo(debug_info); + TFE_DeleteTensorHandle(h); + TF_DeleteStatus(status); +} + +TEST(CApiDebug, 2DCPU) { + TFE_TensorHandle* h = TestMatrixTensorHandle3X2(); + TF_Status* status = TF_NewStatus(); + TFE_TensorDebugInfo* debug_info = TFE_TensorHandleTensorDebugInfo(h, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + ASSERT_EQ(2, TFE_TensorDebugInfoOnDeviceNumDims(debug_info)); + // Shape is the same for CPU tensors. 
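+ // The handle was built as a 3x2 matrix, so dimension 0 should report 3
+ // elements and dimension 1 should report 2.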
+ EXPECT_EQ(3, TFE_TensorDebugInfoOnDeviceDim(debug_info, 0)); + EXPECT_EQ(2, TFE_TensorDebugInfoOnDeviceDim(debug_info, 1)); + + TFE_DeleteTensorDebugInfo(debug_info); + TFE_DeleteTensorHandle(h); + TF_DeleteStatus(status); +} diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h index 49e1aab1cef957..04a6efc47c5177 100644 --- a/tensorflow/c/eager/c_api_internal.h +++ b/tensorflow/c/eager/c_api_internal.h @@ -28,8 +28,8 @@ limitations under the License. #include "tensorflow/c/c_api.h" #include "tensorflow/c/c_api_internal.h" -#include "tensorflow/c/eager/runtime.h" #include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/eager/attr_builder.h" #include "tensorflow/core/common_runtime/eager/context.h" #include "tensorflow/core/common_runtime/eager/eager_executor.h" #include "tensorflow/core/common_runtime/eager/eager_operation.h" @@ -37,6 +37,14 @@ limitations under the License. #include "tensorflow/core/common_runtime/eager/tensor_handle.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/common_runtime/rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/eager/eager_client.h" +#include "tensorflow/core/distributed_runtime/remote_device.h" +#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h" +#include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h" +#include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h" +#include "tensorflow/core/distributed_runtime/server_lib.h" +#include "tensorflow/core/distributed_runtime/worker_env.h" #include "tensorflow/core/framework/rendezvous.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" @@ -51,6 +59,7 @@ struct TFE_ContextOptions { // true if async execution is enabled. bool async = false; TFE_ContextDevicePlacementPolicy policy{TFE_DEVICE_PLACEMENT_SILENT}; + tensorflow::ServerDef server_def; }; struct TFE_Context { @@ -64,6 +73,23 @@ struct TFE_Context { default_policy), async, std::move(device_mgr), rendezvous) {} + explicit TFE_Context( + const tensorflow::SessionOptions& opts, + TFE_ContextDevicePlacementPolicy default_policy, bool async, + tensorflow::DeviceMgr* local_device_mgr, + tensorflow::Rendezvous* rendezvous, + std::unique_ptr server, + std::unique_ptr remote_eager_workers, + std::unique_ptr remote_device_mgr, + const tensorflow::gtl::FlatMap& + remote_contexts) + : context(opts, + static_cast( + default_policy), + async, local_device_mgr, rendezvous, std::move(server), + std::move(remote_eager_workers), std::move(remote_device_mgr), + remote_contexts) {} + tensorflow::EagerContext context; }; @@ -81,6 +107,14 @@ struct TFE_TensorHandle { tensorflow::TensorHandle* handle; }; +struct TFE_TensorDebugInfo { + TFE_TensorDebugInfo(const std::vector& dims) + : dev_dims(dims) {} + + // Fully-padded, minor-to-major. + std::vector dev_dims; +}; + struct TFE_Op { // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a // primitive operation. diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 701175e4943d1d..27ff5f7211b059 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ -16,6 +16,8 @@ limitations under the License. 
#include "tensorflow/c/eager/c_api.h" #include +#include "tensorflow/c/eager/c_api_test_util.h" +#include "tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" @@ -23,128 +25,14 @@ limitations under the License. #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/test_benchmark.h" +#include "tensorflow/core/protobuf/cluster.pb.h" #include "tensorflow/core/protobuf/config.pb.h" +#include "tensorflow/core/protobuf/tensorflow_server.pb.h" using tensorflow::string; namespace { -TFE_TensorHandle* DoubleTestMatrixTensorHandle() { - int64_t dims[] = {2, 2}; - double data[] = {1.0, 2.0, 3.0, 4.0}; - TF_Tensor* t = TF_AllocateTensor( - TF_DOUBLE, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); - memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); - TF_Status* status = TF_NewStatus(); - TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_DeleteTensor(t); - TF_DeleteStatus(status); - return th; -} - -TFE_TensorHandle* TestMatrixTensorHandle() { - int64_t dims[] = {2, 2}; - float data[] = {1.0f, 2.0f, 3.0f, 4.0f}; - TF_Tensor* t = TF_AllocateTensor( - TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); - memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); - TF_Status* status = TF_NewStatus(); - TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_DeleteTensor(t); - TF_DeleteStatus(status); - return th; -} - -TFE_TensorHandle* TestMatrixTensorHandle3X2() { - int64_t dims[] = {3, 2}; - double data[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - TF_Tensor* t = TF_AllocateTensor( - TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); - memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); - TF_Status* status = TF_NewStatus(); - TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_DeleteTensor(t); - TF_DeleteStatus(status); - return th; -} - -TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { - TF_Status* status = TF_NewStatus(); - - TFE_Op* op = TFE_NewOp(ctx, "MatMul", status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_OpAddInput(op, a, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_OpAddInput(op, b, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_DeleteStatus(status); - TFE_OpSetAttrBool(op, "transpose_a", 0); - TFE_OpSetAttrBool(op, "transpose_b", 0); - TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a)); - - return op; -} - -TFE_TensorHandle* TestAxisTensorHandle() { - int64_t dims[] = {1}; - int data[] = {1}; - TF_Tensor* t = TF_AllocateTensor( - TF_INT32, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); - memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); - TF_Status* status = TF_NewStatus(); - TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TF_DeleteTensor(t); - TF_DeleteStatus(status); - return th; -} - -TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input, - TFE_TensorHandle* axis) { - TF_Status* status = TF_NewStatus(); - - TFE_Op* op = TFE_NewOp(ctx, "Min", status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << 
TF_Message(status); - TFE_OpAddInput(op, input, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_OpAddInput(op, axis, status); - CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); - TFE_OpSetAttrBool(op, "keep_dims", 1); - TFE_OpSetAttrType(op, "Tidx", TF_INT32); - TF_DeleteStatus(status); - TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(input)); - - return op; -} - -// If there is a GPU device, returns true and sets 'gpu_device_name' -// accordingly. -bool GetGPUDeviceName(TFE_Context* ctx, string* gpu_device_name) { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); - CHECK_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); - - const int num_devices = TF_DeviceListCount(devices); - for (int i = 0; i < num_devices; ++i) { - const string device_type(TF_DeviceListType(devices, i, status.get())); - CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); - const string device_name(TF_DeviceListName(devices, i, status.get())); - CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); - if (device_type == "GPU") { - *gpu_device_name = device_name; - LOG(INFO) << "Found GPU device " << device_name; - TF_DeleteDeviceList(devices); - return true; - } - } - TF_DeleteDeviceList(devices); - return false; -} - void BM_InitOp(int iters) { tensorflow::testing::StopTiming(); TF_Status* status = TF_NewStatus(); @@ -220,6 +108,103 @@ TEST(CAPI, Context) { TF_DeleteStatus(status); } +tensorflow::ServerDef GetServerDef(int num_tasks) { + tensorflow::ServerDef server_def; + server_def.set_protocol("grpc"); + server_def.set_job_name("localhost"); + server_def.set_task_index(0); + tensorflow::ClusterDef* cluster_def = server_def.mutable_cluster(); + tensorflow::JobDef* job_def = cluster_def->add_job(); + job_def->set_name("localhost"); + for (int i = 0; i < num_tasks; i++) { + int port = tensorflow::testing::PickUnusedPortOrDie(); + job_def->mutable_tasks()->insert( + {i, tensorflow::strings::StrCat("localhost:", port)}); + } + return server_def; +} + +void TestRemoteExecute(bool async) { + tensorflow::ServerDef server_def = GetServerDef(2); + + // This server def has the task index set to 0. 
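+ // The serialized copy (task 0) is what the client context below connects
+ // with, while the in-process gRPC server started next runs as task 1 and
+ // executes the remotely placed ops.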
+ string serialized = server_def.SerializeAsString(); + + server_def.set_task_index(1); + + std::unique_ptr worker_server; + ASSERT_TRUE( + tensorflow::eager::EagerGrpcServer::Create(server_def, &worker_server) + .ok()); + ASSERT_TRUE(worker_server->Start().ok()); + + TF_Status* status = TF_NewStatus(); + TFE_ContextOptions* opts = TFE_NewContextOptions(); + TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(), + status); + TFE_ContextOptionsSetAsync(opts, static_cast(1)); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_Context* ctx = TFE_NewContext(opts, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteContextOptions(opts); + + TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle(); + TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle(); + const char remote_device_name[] = + "/job:localhost/replica:0/task:1/device:CPU:0"; + auto* h0_task1 = + TFE_TensorHandleCopyToDevice(h0_task0, ctx, remote_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + auto* h1_task1 = + TFE_TensorHandleCopyToDevice(h1_task0, ctx, remote_device_name, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_Op* matmul = MatMulOp(ctx, h0_task1, h1_task1); + TFE_OpSetDevice(matmul, remote_device_name, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TFE_TensorHandle* retvals[1]; + int num_retvals = 1; + TFE_Execute(matmul, &retvals[0], &num_retvals, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + auto* retval_task0 = TFE_TensorHandleCopyToDevice( + retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status); + ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_DeleteTensorHandle(retval_task0); + float product[4] = {0}; + EXPECT_EQ(sizeof(product), TF_TensorByteSize(t)); + memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t)); + TF_DeleteTensor(t); + EXPECT_EQ(7, product[0]); + EXPECT_EQ(10, product[1]); + EXPECT_EQ(15, product[2]); + EXPECT_EQ(22, product[3]); + + TFE_DeleteTensorHandle(h0_task0); + TFE_DeleteTensorHandle(h1_task0); + TFE_DeleteTensorHandle(h0_task1); + TFE_DeleteTensorHandle(h1_task1); + TFE_DeleteTensorHandle(retvals[0]); + + TFE_DeleteOp(matmul); + + TFE_ContextAsyncWait(ctx, status); + TFE_DeleteContext(ctx, status); + EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + + TF_DeleteStatus(status); + + // TODO(nareshmodi): Figure out how to correctly shut the server down. + worker_server.release(); +} + +TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); } +TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); } + TEST(CAPI, TensorHandle) { TFE_TensorHandle* h = TestMatrixTensorHandle(); EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(h)); @@ -436,7 +421,7 @@ void TensorHandleSilentCopy(bool async) { // Disable the test if no GPU is present. string gpu_device_name; - if (GetGPUDeviceName(ctx, &gpu_device_name)) { + if (GetDeviceName(ctx, &gpu_device_name, "GPU")) { TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); @@ -483,7 +468,7 @@ void TensorHandleSilentCopyLocal(bool async) { // Disable the test if no GPU is present. 
string gpu_device_name; - if (GetGPUDeviceName(ctx, &gpu_device_name)) { + if (GetDeviceName(ctx, &gpu_device_name, "GPU")) { TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice( hcpu, ctx, gpu_device_name.c_str(), status.get()); ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get()); @@ -524,7 +509,7 @@ void SetAndGetOpDevices(bool async) { // Disable the test if no GPU is present. string gpu_device_name; - if (GetGPUDeviceName(ctx, &gpu_device_name)) { + if (GetDeviceName(ctx, &gpu_device_name, "GPU")) { TFE_OpSetDevice(matmul, "GPU:0", status); ASSERT_TRUE(TF_GetCode(status) == TF_OK) << TF_Message(status); const char* device_name = TFE_OpGetDevice(matmul, status); @@ -588,7 +573,7 @@ void Execute_MatMul_CPU_Runtime_Error(bool async) { TFE_DeleteContextOptions(opts); TFE_TensorHandle* m1 = TestMatrixTensorHandle(); - TFE_TensorHandle* m2 = TestMatrixTensorHandle3X2(); + TFE_TensorHandle* m2 = DoubleTestMatrixTensorHandle3X2(); TFE_Op* matmul = MatMulOp(ctx, m1, m2); TFE_OpSetDevice(matmul, "/job:localhost/replica:0/task:0/device:CPU:0", status); diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc new file mode 100644 index 00000000000000..5607c9dcb0bbec --- /dev/null +++ b/tensorflow/c/eager/c_api_test_util.cc @@ -0,0 +1,163 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/c/eager/c_api_test_util.h" + +#include "tensorflow/c/eager/c_api.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +using tensorflow::string; + +TFE_TensorHandle* TestScalarTensorHandle() { + float data[] = {1.0f}; + TF_Tensor* t = TF_AllocateTensor(TF_FLOAT, nullptr, 0, sizeof(float)); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_TensorHandle* DoubleTestMatrixTensorHandle() { + int64_t dims[] = {2, 2}; + double data[] = {1.0, 2.0, 3.0, 4.0}; + TF_Tensor* t = TF_AllocateTensor( + TF_DOUBLE, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_TensorHandle* TestMatrixTensorHandle() { + int64_t dims[] = {2, 2}; + float data[] = {1.0f, 2.0f, 3.0f, 4.0f}; + TF_Tensor* t = TF_AllocateTensor( + TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2() { + int64_t dims[] = {3, 2}; + double data[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; + TF_Tensor* t = TF_AllocateTensor( + TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_TensorHandle* TestMatrixTensorHandle3X2() { + int64_t dims[] = {3, 2}; + float data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + TF_Tensor* t = TF_AllocateTensor( + TF_FLOAT, &dims[0], sizeof(dims) / sizeof(int64_t), sizeof(data)); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) { + TF_Status* status = TF_NewStatus(); + + TFE_Op* op = TFE_NewOp(ctx, "MatMul", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, a, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, b, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteStatus(status); + TFE_OpSetAttrBool(op, "transpose_a", 0); + TFE_OpSetAttrBool(op, "transpose_b", 0); + TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a)); + + return op; +} + +TFE_TensorHandle* TestAxisTensorHandle() { + int64_t dims[] = {1}; + int data[] = {1}; + TF_Tensor* t = TF_AllocateTensor( + TF_INT32, &dims[0], sizeof(dims) / 
sizeof(int64_t), sizeof(data)); + memcpy(TF_TensorData(t), &data[0], TF_TensorByteSize(t)); + TF_Status* status = TF_NewStatus(); + TFE_TensorHandle* th = TFE_NewTensorHandle(t, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TF_DeleteTensor(t); + TF_DeleteStatus(status); + return th; +} + +TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input, + TFE_TensorHandle* axis) { + TF_Status* status = TF_NewStatus(); + + TFE_Op* op = TFE_NewOp(ctx, "Min", status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, input, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpAddInput(op, axis, status); + CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); + TFE_OpSetAttrBool(op, "keep_dims", 1); + TFE_OpSetAttrType(op, "Tidx", TF_INT32); + TF_DeleteStatus(status); + TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(input)); + + return op; +} + +bool GetDeviceName(TFE_Context* ctx, string* device_name, + const char* device_type) { + std::unique_ptr status( + TF_NewStatus(), TF_DeleteStatus); + TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get()); + CHECK_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get()); + + const int num_devices = TF_DeviceListCount(devices); + for (int i = 0; i < num_devices; ++i) { + const string dev_type(TF_DeviceListType(devices, i, status.get())); + CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + const string dev_name(TF_DeviceListName(devices, i, status.get())); + CHECK_EQ(TF_GetCode(status.get()), TF_OK) << TF_Message(status.get()); + if (dev_type == device_type) { + *device_name = dev_name; + LOG(INFO) << "Found " << device_type << " device " << *device_name; + TF_DeleteDeviceList(devices); + return true; + } + } + TF_DeleteDeviceList(devices); + return false; +} diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h new file mode 100644 index 00000000000000..474cae67c89249 --- /dev/null +++ b/tensorflow/c/eager/c_api_test_util.h @@ -0,0 +1,53 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ +#define TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ + +#include "tensorflow/c/eager/c_api.h" + +#include "tensorflow/core/platform/types.h" + +// Return a tensor handle containing a float scalar +TFE_TensorHandle* TestScalarTensorHandle(); + +// Return a tensor handle containing a 2x2 matrix of doubles +TFE_TensorHandle* DoubleTestMatrixTensorHandle(); + +// Return a tensor handle containing a 2x2 matrix of floats +TFE_TensorHandle* TestMatrixTensorHandle(); + +// Return a tensor handle containing a 3x2 matrix of doubles +TFE_TensorHandle* DoubleTestMatrixTensorHandle3X2(); + +// Return a tensor handle containing a 3x2 matrix of floats +TFE_TensorHandle* TestMatrixTensorHandle3X2(); + +// Return a matmul op multiplying `a` by `b`. 
+TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b); + +// Return an 1-D INT32 tensor containing a single value 1. +TFE_TensorHandle* TestAxisTensorHandle(); + +// Return an op taking minimum of `input` long `axis` dimension. +TFE_Op* MinOp(TFE_Context* ctx, TFE_TensorHandle* input, + TFE_TensorHandle* axis); + +// If there is a device of type `device_type`, returns true +// and sets 'device_name' accordingly. +// `device_type` must be either "GPU" or "TPU". +bool GetDeviceName(TFE_Context* ctx, tensorflow::string* device_name, + const char* device_type); + +#endif // TENSORFLOW_C_EAGER_C_API_TEST_UTIL_H_ diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h index 97c323b8722803..734e712daa39c0 100644 --- a/tensorflow/c/eager/tape.h +++ b/tensorflow/c/eager/tape.h @@ -48,7 +48,7 @@ struct OpTapeEntry { // Should be called before deleting the backward function. TODO(apassos) use // unique_ptrs to ensure this happens. - std::function backward_function_deleter; + std::function backward_function_deleter; }; // Map from tensor_id to internally-defined operation-id of the operation which @@ -104,14 +104,12 @@ class VSpace { gtl::ArraySlice output_gradients, std::vector* result) const = 0; + // Marks the following gradient as a result so it's not consumed by backward + // functions. + virtual void MarkAsResult(Gradient* gradient) const = 0; + // Deletes the input tensor. virtual void DeleteGradient(Gradient* gradient) const = 0; - - // Lets this VSpace know that it can release resources held by the - // `backward_function`, It will not be called again. - // `backward_function` must not be null. - virtual void ReleaseBackwardFunction( - BackwardFunction* backward_function) const = 0; }; // Traces the execution of operations, doing eager garbage collection, and @@ -126,19 +124,21 @@ class GradientTape { GradientTape(bool persistent) : persistent_(persistent) {} ~GradientTape() { for (const auto& pair : op_tape_) { - pair.second.backward_function_deleter(); + pair.second.backward_function_deleter(pair.second.backward_function); } } - bool ShouldRecord(gtl::ArraySlice tensor_ids); + bool ShouldRecord(gtl::ArraySlice tensor_ids, + gtl::ArraySlice dtypes); void Watch(int64 tensor_id); - void RecordOperation(const string& op_type, - gtl::ArraySlice output_tensors, - gtl::ArraySlice input_tensor_id, - BackwardFunction* backward_function, - const std::function& backward_function_deleter); + void RecordOperation( + const string& op_type, gtl::ArraySlice output_tensors, + gtl::ArraySlice input_tensor_id, + gtl::ArraySlice input_dtypes, + BackwardFunction* backward_function, + const std::function& backward_function_deleter); void DeleteTrace(int64 tensor_id); @@ -170,12 +170,32 @@ class GradientTape { // Template instantiations here +inline bool IsDtypeTrainable(DataType dtype) { + switch (dtype) { + case DT_HALF: + case DT_BFLOAT16: + case DT_FLOAT: + case DT_DOUBLE: + case DT_COMPLEX64: + case DT_COMPLEX128: + case DT_RESOURCE: + case DT_VARIANT: + return true; + default: + return false; + } +} + template bool GradientTape::ShouldRecord( - gtl::ArraySlice tensor_ids) { - for (int64 i : tensor_ids) { - if (tensor_tape_.find(i) != tensor_tape_.end()) { - return true; + gtl::ArraySlice tensor_ids, + gtl::ArraySlice dtypes) { + CHECK_EQ(tensor_ids.size(), dtypes.size()); + for (int i = 0; i < tensor_ids.size(); ++i) { + if (tensor_tape_.find(tensor_ids[i]) != tensor_tape_.end()) { + if (IsDtypeTrainable(dtypes[i])) { + return true; + } } } return false; @@ -189,10 +209,12 
@@ void GradientTape::Watch(int64 tensor_id) { template void GradientTape::RecordOperation( const string& op_type, gtl::ArraySlice output_tensors, - gtl::ArraySlice input_tensor_id, BackwardFunction* backward_function, - const std::function& backward_function_deleter) { - if (!ShouldRecord(input_tensor_id)) { - backward_function_deleter(); + gtl::ArraySlice input_tensor_id, + gtl::ArraySlice input_dtypes, + BackwardFunction* backward_function, + const std::function& backward_function_deleter) { + if (!ShouldRecord(input_tensor_id, input_dtypes)) { + backward_function_deleter(backward_function); return; } std::vector ids; @@ -247,7 +269,7 @@ void GradientTape::DeleteTrace(int64 tensor_id) { for (int64 id : op_it->second.input_tensor_id) { DeleteTrace(id); } - op_it->second.backward_function_deleter(); + op_it->second.backward_function_deleter(op_it->second.backward_function); op_tape_.erase(op_it); } @@ -332,8 +354,7 @@ BackpropInitialState PrepareBackprop( count_it->second++; } else { result.tensor_usage_counts[it] = 1; - if (sources_set.find(it) == sources_set.end() && - tensor_tape.find(it) != tensor_tape.end()) { + if (tensor_tape.find(it) != tensor_tape.end()) { tensor_stack.push_back(it); } } @@ -354,7 +375,8 @@ BackpropInitialState PrepareBackprop( // backward functions that will be used for gradient computation // has been transferred to `result`. for (const auto& op_pair : *op_tape) { - op_pair.second.backward_function_deleter(); + op_pair.second.backward_function_deleter( + op_pair.second.backward_function); } op_tape->clear(); } @@ -380,49 +402,39 @@ Status InitialGradients(const VSpace& vspace, gtl::ArraySlice output_gradients, const TensorTape& tensor_tape, const OpTape& op_tape, - const gtl::FlatMap& tensor_usage_counts, gtl::FlatMap>* result) { for (int i = 0; i < target_tensor_ids.size(); ++i) { const int64 id = target_tensor_ids[i]; - if (tensor_usage_counts.find(id) != tensor_usage_counts.end()) { - if (!output_gradients.empty() && output_gradients[i] != nullptr) { - // TODO(apassos) figure out how to print debugging information here. 
- return errors::InvalidArgument( - "A gradient was provided for a tensor which is used as part of the " - "computation."); - } - } else { - if (output_gradients.empty() || output_gradients[i] == nullptr) { - auto tensor_it = tensor_tape.find(id); - if (tensor_it != tensor_tape.end() && tensor_it->second != -1) { - auto op_it = op_tape.find(tensor_it->second); - if (op_it == op_tape.end()) { - return errors::Internal( - "Internal state of the gradient tape is invalid: " - "failed to find operation producing a tensor"); - } - bool found = false; - for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) { - if (op_it->second.output_tensor_info[j].id == id) { - found = true; - (*result)[id].push_back( - vspace.Ones(op_it->second.output_tensor_info[j].shape, - op_it->second.output_tensor_info[j].dtype)); - break; - } - } - if (!found) { - return errors::Internal( - "Internal state of the gradient tape is invalid: " - "none of operations outputs match expected tensor"); + if (output_gradients.empty() || output_gradients[i] == nullptr) { + auto tensor_it = tensor_tape.find(id); + if (tensor_it != tensor_tape.end() && tensor_it->second != -1) { + auto op_it = op_tape.find(tensor_it->second); + if (op_it == op_tape.end()) { + return errors::Internal( + "Internal state of the gradient tape is invalid: " + "failed to find operation producing a tensor"); + } + bool found = false; + for (int j = 0; j < op_it->second.output_tensor_info.size(); ++j) { + if (op_it->second.output_tensor_info[j].id == id) { + found = true; + (*result)[id].push_back( + vspace.Ones(op_it->second.output_tensor_info[j].shape, + op_it->second.output_tensor_info[j].dtype)); + break; } - } else { - // No record of the target tensor found on the tape, so no gradient - // needs to be computed from it. Do nothing. + } + if (!found) { + return errors::Internal( + "Internal state of the gradient tape is invalid: " + "none of operations outputs match expected tensor"); } } else { - (*result)[id].push_back(output_gradients[i]); + // No record of the target tensor found on the tape, so no gradient + // needs to be computed from it. Do nothing. 
} + } else { + (*result)[id].push_back(output_gradients[i]); } } return Status::OK(); @@ -451,13 +463,12 @@ Status GradientTape::ComputeGradient( InitialStack(state.op_tape, state.op_missing_tensor); gtl::FlatMap> gradients; Status s = InitialGradients(vspace, target_tensor_ids, output_gradients, - tensor_tape_, state.op_tape, - state.tensor_usage_counts, &gradients); + tensor_tape_, state.op_tape, &gradients); auto cleanup = [this, &state]() { if (!persistent_) { // Release all backprop functions for (const auto& pair : state.op_tape) { - pair.second.backward_function_deleter(); + pair.second.backward_function_deleter(pair.second.backward_function); } } }; @@ -509,10 +520,15 @@ Status GradientTape::ComputeGradient( } } else { any_gradient_nonzero = true; - out_gradients.push_back(vspace.AggregateGradients(grad_it->second)); + auto new_gradients = vspace.AggregateGradients(grad_it->second); if (sources_set.find(grad_it->first) == sources_set.end()) { gradients.erase(grad_it); + } else { + grad_it->second.clear(); + grad_it->second.push_back(new_gradients); + vspace.MarkAsResult(new_gradients); } + out_gradients.push_back(new_gradients); } } std::vector in_gradients; @@ -520,7 +536,7 @@ Status GradientTape::ComputeGradient( Status s = vspace.CallBackwardFunction(trace.backward_function, out_gradients, &in_gradients); if (!persistent_) { - vspace.ReleaseBackwardFunction(trace.backward_function); + trace.backward_function_deleter(trace.backward_function); } if (!s.ok()) { cleanup(); @@ -529,7 +545,7 @@ Status GradientTape::ComputeGradient( } else { in_gradients.resize(trace.input_tensor_id.size()); if (!persistent_) { - vspace.ReleaseBackwardFunction(trace.backward_function); + trace.backward_function_deleter(trace.backward_function); } for (Gradient* grad : out_gradients) { if (grad != nullptr) { diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh index 02a6a58b6153bb..7184ad68fb79f2 100755 --- a/tensorflow/c/generate-pc.sh +++ b/tensorflow/c/generate-pc.sh @@ -15,10 +15,12 @@ # ============================================================================== TF_PREFIX='/usr/local' +LIBDIR='lib' usage() { echo "Usage: $0 OPTIONS" echo -e "-p, --prefix\tset installation prefix (default: /usr/local)" + echo -e "-l, --libdir\tset lib directory (default: lib)" echo -e "-v, --version\tset TensorFlow version" echo -e "-h, --help\tdisplay this message" } @@ -26,7 +28,7 @@ usage() { [ $# == 0 ] && usage && exit 0 # read the options -ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@") +ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@") eval set -- "$ARGS" # extract options and their arguments into variables. 
@@ -38,6 +40,11 @@ while true ; do "") shift 2 ;; *) TF_PREFIX=$2 ; shift 2 ;; esac ;; + -l|--libdir) + case "$2" in + "") shift 2 ;; + *) LIBDIR=$2 ; shift 2 ;; + esac ;; -v|--version) case "$2" in "") shift 2 ;; @@ -55,7 +62,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX" cat << EOF > tensorflow.pc prefix=${TF_PREFIX} exec_prefix=\${prefix} -libdir=\${exec_prefix}/lib +libdir=\${exec_prefix}/${LIBDIR} includedir=\${prefix}/include Name: TensorFlow diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc index d73121c7b701ec..d6a4f141b6bb8c 100644 --- a/tensorflow/cc/framework/cc_op_gen.cc +++ b/tensorflow/cc/framework/cc_op_gen.cc @@ -440,7 +440,7 @@ string AvoidCPPKeywords(StringPiece name) { if (IsCPPKeyword(name)) { return strings::StrCat(name, "_"); } - return name.ToString(); + return std::string(name); } void InferArgAttributes(const OpDef::ArgDef& arg, diff --git a/tensorflow/cc/framework/scope.cc b/tensorflow/cc/framework/scope.cc index c143b978338815..62a889181e787f 100644 --- a/tensorflow/cc/framework/scope.cc +++ b/tensorflow/cc/framework/scope.cc @@ -220,7 +220,7 @@ std::unordered_set Scope::Impl::GetColocationConstraints( for (const string& entry : node_constraints) { StringPiece s(entry); if (str_util::ConsumePrefix(&s, kColocationGroupPrefix)) { - current_constraints.insert(s.ToString()); + current_constraints.insert(std::string(s)); } } } else { diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc index 52c177212a8c88..35a01e0341cb08 100644 --- a/tensorflow/cc/gradients/math_grad.cc +++ b/tensorflow/cc/gradients/math_grad.cc @@ -38,6 +38,7 @@ REGISTER_NO_GRADIENT_OP("NotEqual"); REGISTER_NO_GRADIENT_OP("LogicalAnd"); REGISTER_NO_GRADIENT_OP("LogicalOr"); REGISTER_NO_GRADIENT_OP("LogicalNot"); +REGISTER_NO_GRADIENT_OP("Floor"); // Conjugate helper function returns the conjugate of an Output if it // is complex valued. diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index 1b4c7c2688083e..fd7b6fe6625f27 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -31,7 +31,6 @@ using ops::AddN; using ops::BatchMatMul; using ops::Const; using ops::Div; -using ops::Greater; using ops::MatMul; using ops::Max; using ops::Maximum; @@ -46,7 +45,6 @@ using ops::RealDiv; using ops::SquaredDifference; using ops::Sub; using ops::Sum; -using ops::Where3; // TODO(andydavis) Test gradient function against numeric gradients output. 
// TODO(andydavis) As more gradients are added move common test functions diff --git a/tensorflow/cc/gradients/nn_grad.cc b/tensorflow/cc/gradients/nn_grad.cc index 0cb3132e94e381..c73482d5f4d13a 100644 --- a/tensorflow/cc/gradients/nn_grad.cc +++ b/tensorflow/cc/gradients/nn_grad.cc @@ -255,6 +255,53 @@ Status LRNGradHelper(const Scope& scope, const Operation& op, } REGISTER_GRADIENT_OP("LRN", LRNGradHelper); +Status SoftplusGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto dx = internal::SoftplusGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softplus", SoftplusGradHelper); + +Status SoftsignGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + auto dx = internal::SoftsignGrad(scope, grad_inputs[0], op.input(0)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("Softsign", SoftsignGradHelper); + +Status FractionalAvgPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalAvgPoolGrad( + scope, Shape(scope, op.input(0), Shape::OutType(DT_INT64)), + grad_inputs[0], op.output(1), op.output(2), + internal::FractionalAvgPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalAvgPool", FractionalAvgPoolGradHelper); + +Status FractionalMaxPoolGradHelper(const Scope& scope, const Operation& op, + const std::vector& grad_inputs, + std::vector* grad_outputs) { + bool overlapping; + TF_RETURN_IF_ERROR( + GetNodeAttr(op.output(0).node()->attrs(), "overlapping", &overlapping)); + auto dx = internal::FractionalMaxPoolGrad( + scope, op.input(0), op.output(0), grad_inputs[0], op.output(1), + op.output(2), internal::FractionalMaxPoolGrad::Overlapping(overlapping)); + grad_outputs->push_back(dx); + return scope.status(); +} +REGISTER_GRADIENT_OP("FractionalMaxPool", FractionalMaxPoolGradHelper); + } // anonymous namespace } // namespace ops } // namespace tensorflow diff --git a/tensorflow/cc/gradients/nn_grad_test.cc b/tensorflow/cc/gradients/nn_grad_test.cc index c4eba7ecb017fe..b4d457a9d14eb7 100644 --- a/tensorflow/cc/gradients/nn_grad_test.cc +++ b/tensorflow/cc/gradients/nn_grad_test.cc @@ -28,6 +28,8 @@ namespace { using ops::BiasAdd; using ops::Conv2D; using ops::Elu; +using ops::FractionalAvgPool; +using ops::FractionalMaxPool; using ops::L2Loss; using ops::LogSoftmax; using ops::LRN; @@ -41,6 +43,8 @@ using ops::Relu; using ops::Relu6; using ops::Selu; using ops::Softmax; +using ops::Softplus; +using ops::Softsign; class NNGradTest : public ::testing::Test { protected: @@ -71,22 +75,30 @@ class NNGradTest : public ::testing::Test { EXPECT_LT(max_error, 1e-3); } - // Sets tensor with random values, ensuring that the max value is largest by - // a reasonable amount. - // This is an issue for MaxPool, MaxPoolV2 and MaxPool3D, in which - // perturbations by the numeric gradient computation in the gradient checker - // can change the max value if values are too close together. + // Sets tensor with random values, ensuring that every pair of elements are at + // least a reasonable amount apart. 
+ // This is an issue for max pooling operations, in which perturbations by the + // numeric gradient computation in the gradient checker can change the max + // value if a pool has values that are too close together. template - void SetRandomValuesWithBumpedMax(Tensor* tensor) { + void SetRandomValuesForMaxPooling(Tensor* tensor) { auto tensor_flat = tensor->flat(); - tensor_flat.setRandom(); - int32 max_index = 0; - for (size_t i = 1; i < tensor->NumElements(); i++) { - if (tensor_flat(i) > tensor_flat(max_index)) { - max_index = i; - } + // First set the array to an increasing sequence of values spaced + // a reasonable amount apart + T cur = 0; + for (size_t i = 0; i < tensor->NumElements(); i++) { + tensor_flat(i) = cur; + cur += 5e-2; + } + // Fischer-Yates shuffle the array + for (size_t i = tensor->NumElements() - 1; i >= 1; i--) { + // j <- random integer 0 <= j <= i + size_t j = random::New64() % (i + 1); + // swap values at i, j + T tmp = tensor_flat(i); + tensor_flat(i) = tensor_flat(j); + tensor_flat(j) = tmp; } - tensor_flat(max_index) += 1e-2; } Scope scope_; @@ -189,7 +201,7 @@ TEST_F(NNGradTest, MaxPoolGradHelper) { const std::vector strides{1, 2, 2, 1}; auto y = MaxPool(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -202,7 +214,7 @@ TEST_F(NNGradTest, MaxPoolGradV2Helper) { Tensor strides = test::AsTensor({1, 2, 2, 1}, {4}); auto y = MaxPoolV2(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -215,7 +227,7 @@ TEST_F(NNGradTest, MaxPool3DGradHelper) { const std::vector strides{1, 3, 3, 3, 1}; auto y = MaxPool3D(scope_, x, ksize, strides, "VALID"); Tensor x_init_value = Tensor(DT_FLOAT, x_shape); - SetRandomValuesWithBumpedMax(&x_init_value); + SetRandomValuesForMaxPooling(&x_init_value); RunTest(x, x_init_value, y, y_shape); } @@ -248,5 +260,45 @@ TEST_F(NNGradTest, LRN){ RunTest(x, x_shape, y, x_shape); } +TEST_F(NNGradTest, SoftplusGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softplus(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, SoftsignGrad) { + TensorShape shape({3, 7}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(shape)); + auto y = Softsign(scope_, x); + RunTest(x, shape, y, shape); +} + +TEST_F(NNGradTest, FractionalAvgPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. + auto y = FractionalAvgPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalAvgPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_shape, y.output, y_shape); +} + +TEST_F(NNGradTest, FractionalMaxPoolGradHelper) { + TensorShape x_shape({1, 3, 7, 1}); + auto x = Placeholder(scope_, DT_FLOAT, Placeholder::Shape(x_shape)); + // Force consistent pooling regions for unit testing. 
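+ // Deterministic(true) with fixed Seed/Seed2 values keeps the pseudo-random
+ // pooling regions stable across runs, so the numeric gradient check compares
+ // like with like.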
+ auto y = FractionalMaxPool( + scope_, x, {1, 1.2, 1.9, 1}, + FractionalMaxPool::Deterministic(true).Overlapping(true).Seed(1).Seed2( + 2)); + Tensor x_init_value = Tensor(DT_FLOAT, x_shape); + SetRandomValuesForMaxPooling(&x_init_value); + TensorShape y_shape({1, 2, 3, 1}); + RunTest(x, x_init_value, y.output, y_shape); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc index 4ddddcb5863c9f..23e9dc40d23899 100644 --- a/tensorflow/cc/tools/freeze_saved_model.cc +++ b/tensorflow/cc/tools/freeze_saved_model.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/cc/tools/freeze_saved_model.h" +#include #include #include "tensorflow/core/framework/attr_value.pb.h" @@ -71,6 +72,15 @@ void GetNodeNameToNodeDefMap( } } +// Strips off the tensor part of the tensor_name to get the node_name. +const string GetNodeNameFromTensorName(string tensor_name) { + if (tensor_name[0] == '^') { + tensor_name.erase(0, 1); + } + std::vector tensor_name_parts = str_util::Split(tensor_name, ':'); + return tensor_name_parts[0]; +} + // Gets the set of node names needed by `outputs` and the corresponding set of // variable nodes to convert. void GetReachableNodesAndVariables( @@ -83,10 +93,8 @@ void GetReachableNodesAndVariables( new std::unordered_set({"Variable", "VariableV2", "VarHandleOp"}); std::queue nodes_to_visit; - for (const string& tensor_name : outputs) { - // We need to strip off the tensor part to get the node name. - std::vector tensor_name_parts = str_util::Split(tensor_name, ':'); - nodes_to_visit.push(tensor_name_parts[0]); + for (const string& output_tensor_name : outputs) { + nodes_to_visit.push(GetNodeNameFromTensorName(output_tensor_name)); } // We do a traversal backwards from the outputs specified in the MetaGraphDef. while (!nodes_to_visit.empty()) { @@ -100,8 +108,8 @@ void GetReachableNodesAndVariables( if (kVariableTypes->find(node->op()) != kVariableTypes->end()) { variable_node_names->insert(node->name()); } - for (const string& input : node->input()) { - nodes_to_visit.push(input); + for (const string& input_tensor_name : node->input()) { + nodes_to_visit.push(GetNodeNameFromTensorName(input_tensor_name)); } } } diff --git a/tensorflow/cc/tools/freeze_saved_model_test.cc b/tensorflow/cc/tools/freeze_saved_model_test.cc index cd35fd3b95deec..979b23c3fc5f66 100644 --- a/tensorflow/cc/tools/freeze_saved_model_test.cc +++ b/tensorflow/cc/tools/freeze_saved_model_test.cc @@ -351,6 +351,56 @@ TEST_F(FreezeTest, GraphDefWithNoVariables) { GraphDefEqual(frozen_graph_def, graph_def); } +TEST_F(FreezeTest, GraphDefWithMultiOutputOperation) { + // Tensors from operations with multiple outputs get tensor suffixes when used + // in input fields of following nodes, i.e. split:0, split:1. + // Test that we traverse those correctly. 
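+ // (GetNodeNameFromTensorName is expected to map an input such as "split:1"
+ // back to the node name "split" while walking the graph.)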
+ SavedModelBundle saved_model_bundle; + GraphDef graph_def; + Scope scope = Scope::NewRootScope(); + Output a = ops::Const(scope.WithOpName("a"), {10.0f, 10.0f}, {2}); + Output axis = ops::Const(scope.WithOpName("axis"), 0, {}); + OutputList split = ops::Split(scope.WithOpName("split"), axis, a, 2).output; + Output b = ops::Const(scope.WithOpName("b"), 10.0f, {}); + Output c = ops::Mul(scope.WithOpName("c"), split[1], b); + TF_ASSERT_OK(scope.ToGraphDef(&graph_def)); + TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(graph_def, {"c:0"}, "", + &saved_model_bundle)); + + GraphDef frozen_graph_def; + std::unordered_set inputs; + std::unordered_set outputs; + TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs, + &outputs)); + + GraphDefEqual(frozen_graph_def, graph_def); +} + +TEST_F(FreezeTest, GraphDefWithControlDependency) { + // Inputs that are control dependencies get tensor prefixes, + // i.e. ^control_dependency. + // Test that we traverse those correctly. + SavedModelBundle saved_model_bundle; + GraphDef graph_def; + Scope scope = Scope::NewRootScope(); + Output source = ops::Const(scope.WithOpName("source"), 10.0f, {}); + Output a = ops::Const(scope.WithOpName("a").WithControlDependencies(source), + {10.0f, 10.0f}, {2}); + Output b = ops::Const(scope.WithOpName("b"), 10.0f, {}); + Output c = ops::Mul(scope.WithOpName("c"), a, b); + TF_ASSERT_OK(scope.ToGraphDef(&graph_def)); + TF_ASSERT_OK(AddGraphDefWithOutputsToSavedModelBundle(graph_def, {"c:0"}, "", + &saved_model_bundle)); + + GraphDef frozen_graph_def; + std::unordered_set inputs; + std::unordered_set outputs; + TF_ASSERT_OK(FreezeSavedModel(saved_model_bundle, &frozen_graph_def, &inputs, + &outputs)); + + GraphDefEqual(frozen_graph_def, graph_def); +} + TEST_F(FreezeTest, GraphDefWithoutDependentVariables) { TestFreezeGraphWithoutDependentVariables(false); } diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index 19e6bf68e77725..2119c8ec47f941 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -214,7 +214,6 @@ cc_library( "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", "//tensorflow/core:lib", "@llvm//:core", - "@llvm//:execution_engine", "@llvm//:support", "@llvm//:target", ], diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 2cae85e8965216..0025842aead539 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -333,6 +333,20 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, R"(#include "tensorflow/compiler/xla/xla_data.pb.h")" : ""; + const string include_hlo_profile_printer_data_proto = + opts.gen_hlo_profile_printer_data + ? R"(#include "tensorflow/compiler/xla/service/hlo_profile_printer_data.pb.h")" + : ""; + + // When HLO profiling is disabled we only forward declare the + // HloProfilePrinter protobuf. So we can only conditionally emit this code + // calling HloProfilePrinter::profile_counters_size. + const string assign_profile_counters_size = + opts.gen_hlo_profile_printer_data + ? "data->profile_counters_size = " + "data->hlo_profile_printer_data->profile_counters_size();" + : ""; + // Use a poor-man's text templating mechanism; first populate the full header // with placeholder tokens, and then rewrite the tokens with real values. 
*header = @@ -348,6 +362,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, #define TFCOMPILE_GENERATED_{{ENTRY}}_H_ // NOLINT(build/header_guard) {{INCLUDE_XLA_DATA_PROTO}} +{{INCLUDE_HLO_PROFILE_PRINTER_DATA_PROTO}} #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" #include "tensorflow/core/platform/types.h" @@ -418,6 +433,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { data->arg_names = StaticArgNames(); data->result_names = StaticResultNames(); data->program_shape = StaticProgramShape(); + data->hlo_profile_printer_data = StaticHloProfilePrinterData(); + {{ASSIGN_PROFILE_COUNTERS_SIZE}} return data; }(); return *kStaticData; @@ -487,6 +504,13 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { static const xla::ProgramShape* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}}; return kShape; } + + // Metadata that can be used to pretty-print profile counters. + static const xla::HloProfilePrinterData* StaticHloProfilePrinterData() { + static const xla::HloProfilePrinterData* kHloProfilePrinterData = + {{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}}; + return kHloProfilePrinterData; + } }; {{NS_END}} @@ -501,35 +525,41 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction { {"{{ARG_NAMES_CODE}}", arg_names_code}, {"{{ARG_NUM}}", strings::StrCat(arg_sizes.size())}, {"{{ARG_SIZES}}", str_util::Join(arg_sizes, ", ")}, + {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size}, {"{{CLASS}}", opts.class_name}, + {"{{DECLS_FROM_OBJ_FILE}}", + str_util::Join(metadata_result.header_variable_decls, "\n")}, {"{{ENTRY}}", compile_result.entry_point}, + {"{{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}}", + metadata_result.hlo_profile_printer_data_access_shim}, {"{{INCLUDE_XLA_DATA_PROTO}}", include_xla_data_proto}, + {"{{INCLUDE_HLO_PROFILE_PRINTER_DATA_PROTO}}", + include_hlo_profile_printer_data_proto}, {"{{METHODS_ARG}}\n", methods_arg}, {"{{METHODS_RESULT}}\n", methods_result}, {"{{NS_END}}\n", ns_end}, {"{{NS_START}}\n", ns_start}, {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(ps)}, + {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}", + metadata_result.program_shape_access_shim}, {"{{RESULT_INDEX}}", strings::StrCat(result_index)}, {"{{RESULT_NAMES_CODE}}", result_names_code}, {"{{TEMP_BYTES_ALIGNED}}", strings::StrCat(temp_bytes_aligned)}, {"{{TEMP_BYTES_TOTAL}}", strings::StrCat(temp_bytes_total)}, {"{{TEMP_NUM}}", strings::StrCat(temp_sizes.size())}, - {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")}, - {"{{DECLS_FROM_OBJ_FILE}}", - str_util::Join(metadata_result.header_variable_decls, "\n")}, - {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}", - metadata_result.program_shape_access_shim}}; + {"{{TEMP_SIZES}}", str_util::Join(temp_sizes, ", ")}}; str_util::ReplaceAllPairs(header, rewrites); return Status::OK(); } -static string CreateUniqueIdentifierForProgramShape(const CodegenOpts& opts) { +static string CreateUniqueIdentifier(const CodegenOpts& opts, + StringPiece suffix) { string result = "__tfcompile"; for (const string& n : opts.namespaces) { strings::StrAppend(&result, "_", n); } - strings::StrAppend(&result, "_", opts.class_name, "_ProgramShape"); + strings::StrAppend(&result, "_", opts.class_name, "_", suffix); return result; } @@ -550,18 +580,31 @@ Status GenerateMetadata(const CodegenOpts& opts, // When asked to serialize a null protobuf, CreateEmbeddedProtocolBuffer gives // a shim that evaluates to nullptr, which is what we want. 
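+  // (The same convention holds for CreateEmbeddedProtocolBuffers below: a
+  // null message in a ProtobufToEmbed entry yields a "nullptr" shim for that
+  // entry.)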
+ ProtobufToEmbed program_shape_protobuf{ + CreateUniqueIdentifier(opts, "ProgramShape"), "xla::ProgramShape", + program_shape.get()}; + + ProtobufToEmbed hlo_profile_printer_data_protobuf{ + CreateUniqueIdentifier(opts, "HloProfilePrinterData"), + "xla::HloProfilePrinterData", + compile_result.aot->hlo_profile_printer_data()}; + TF_ASSIGN_OR_RETURN( - EmbeddedProtocolBuffer embedded_program_shape, - CreateEmbeddedProtocolBuffer(opts.target_triple, - CreateUniqueIdentifierForProgramShape(opts), - "xla::ProgramShape", program_shape.get())); + EmbeddedProtocolBuffers embedded_protobufs, + CreateEmbeddedProtocolBuffers( + opts.target_triple, + {program_shape_protobuf, hlo_profile_printer_data_protobuf})); metadata_result->program_shape_access_shim = - std::move(embedded_program_shape.cpp_shim_expression); + std::move(embedded_protobufs.cpp_shims[0].expression); + metadata_result->hlo_profile_printer_data_access_shim = + std::move(embedded_protobufs.cpp_shims[1].expression); + metadata_result->header_variable_decls.emplace_back( + std::move(embedded_protobufs.cpp_shims[0].variable_decl)); metadata_result->header_variable_decls.emplace_back( - std::move(embedded_program_shape.cpp_variable_decl)); + std::move(embedded_protobufs.cpp_shims[1].variable_decl)); metadata_result->object_file_data = - std::move(embedded_program_shape.object_file_data); + std::move(embedded_protobufs.object_file_data); return Status::OK(); } diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h index 3430b1f96cf4d3..83f2d3ee11d09d 100644 --- a/tensorflow/compiler/aot/codegen.h +++ b/tensorflow/compiler/aot/codegen.h @@ -44,6 +44,10 @@ struct CodegenOpts { // If true, generate program shape data for the ProgramShape method. bool gen_program_shape = false; + + // If true, emit a serialized HloProfilePrinterData protobuf that can be used + // to pretty print HLO profile counters. + bool gen_hlo_profile_printer_data = false; }; // Describes a generated metadata object file. @@ -57,6 +61,12 @@ struct MetadataResult { // GenerateMetadata. string program_shape_access_shim; + // hlo_profile_printer_data_access_shim is a C++ expression that constructs + // the xla::HloProfilePrinterData instance for the CompileResult passed to + // GenerateMetadata. If the xla::HloProfilePrinterData is null then this is a + // C++ expression that evaluates to nullptr at runtime. + string hlo_profile_printer_data_access_shim; + // The contents of the object (".o") file. 
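+  // (For tfcompile this object file now carries both embedded protobufs
+  // referenced by the access shims declared above.)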
string object_file_data; }; diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index 2642536c4f67eb..29bc9c13b889c8 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -172,7 +172,7 @@ TEST(CodegenTest, Golden) { fetch->set_name("myfetch"); CompileResult compile_result; compile_result.aot.reset( - new xla::cpu::CpuAotCompilationResult({}, {1, -1, 2, -1, 3, 120}, 5)); + new xla::cpu::CpuAotCompilationResult({}, {1, -1, 2, -1, 3, 120}, 5, {})); compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( { xla::ShapeUtil::MakeShape(xla::F32, {1, 2}), diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index ac3b5873318873..6641d45e83020f 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -10,6 +10,7 @@ #define TFCOMPILE_GENERATED_entry_point_H_ // NOLINT(build/header_guard) #include "tensorflow/compiler/xla/xla_data.pb.h" + #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" #include "tensorflow/core/platform/types.h" @@ -23,6 +24,7 @@ extern "C" void entry_point( extern "C" char __tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[]; + namespace foo { namespace bar { @@ -54,9 +56,9 @@ namespace bar { // // Memory stats: // arg bytes total: 104 -// arg bytes aligned: 128 +// arg bytes aligned: 192 // temp bytes total: 126 -// temp bytes aligned: 224 +// temp bytes aligned: 320 class MyClass : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. @@ -82,6 +84,8 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { data->arg_names = StaticArgNames(); data->result_names = StaticResultNames(); data->program_shape = StaticProgramShape(); + data->hlo_profile_printer_data = StaticHloProfilePrinterData(); + return data; }(); return *kStaticData; @@ -243,6 +247,13 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction { }(); return kShape; } + + // Metadata that can be used to pretty-print profile counters. + static const xla::HloProfilePrinterData* StaticHloProfilePrinterData() { + static const xla::HloProfilePrinterData* kHloProfilePrinterData = + nullptr; + return kHloProfilePrinterData; + } }; } // end namespace bar diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index e17a7c4bf67321..bbc35da2ef6d14 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -44,7 +44,7 @@ namespace { // Compiles the XLA computation into executable code. Status CompileXla(xla::CompileOnlyClient* client, - const xla::Computation& computation, + const xla::XlaComputation& computation, const xla::cpu::CpuAotCompilationOptions& aot_opts, CompileResult* compile_result) { // Retrieves arg and result layouts from the computation. 
@@ -62,7 +62,7 @@ Status CompileXla(xla::CompileOnlyClient* client, for (int i = 0; i < pshape->parameters_size(); ++i) { arg_layouts.push_back(pshape->mutable_parameters(i)); } - xla::CompileOnlyClient::AotComputationInstance instance; + xla::CompileOnlyClient::AotXlaComputationInstance instance; instance.computation = &computation; instance.argument_layouts = std::move(arg_layouts); instance.result_layout = &pshape->result(); @@ -93,14 +93,14 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config, xla::CompileOnlyClient* client = xla::ClientLibrary::GetOrCreateCompileOnlyClient(cpu_platform) .ValueOrDie(); - xla::Computation computation; + xla::XlaComputation computation; TF_RETURN_IF_ERROR( ConvertGraphDefToXla(graph_def, config, client, &computation)); if (!flags.out_session_module.empty()) { - TF_ASSIGN_OR_RETURN(std::unique_ptr module, + TF_ASSIGN_OR_RETURN(std::unique_ptr module, computation.Snapshot()); - // Serialize the SessionModule deterministically so that all the outputs of - // a tf_library genrule are deterministic. + // Serialize the HloSnapshot deterministically so that all the outputs of a + // tf_library genrule are deterministic. string proto; TF_RET_CHECK(SerializeToStringDeterministic(*module, &proto)); TF_RETURN_IF_ERROR( @@ -110,6 +110,7 @@ Status CompileGraph(const GraphDef& graph_def, const tf2xla::Config& config, flags.target_triple, flags.target_cpu, flags.target_features, flags.entry_point, xla::cpu::CpuAotCompilationOptions::RelocationModel::BigPic); + return CompileXla(client, computation, aot_opts, compile_result); } diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.cc b/tensorflow/compiler/aot/embedded_protocol_buffers.cc index 0048eec93bbe10..4e27aafec77476 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.cc +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.cc @@ -36,9 +36,8 @@ namespace tfcompile { using xla::llvm_ir::AsStringRef; -static std::unique_ptr CreateModuleWithEmbeddedProtocolBuffer( - llvm::LLVMContext* llvm_context, llvm::TargetMachine* target_machine, - const ::tensorflow::protobuf::MessageLite& proto, +static void AddEmbeddedProtocolBufferToLlvmModule( + llvm::Module* module, const ::tensorflow::protobuf::MessageLite& proto, StringPiece unique_identifier, string* protobuf_array_symbol_name, int64* protobuf_array_size) { string protobuf_array_contents = proto.SerializeAsString(); @@ -46,19 +45,14 @@ static std::unique_ptr CreateModuleWithEmbeddedProtocolBuffer( strings::StrCat(unique_identifier, "_protobuf_array_contents"); *protobuf_array_size = protobuf_array_contents.size(); - std::unique_ptr module = - MakeUnique("embedded_data_module", *llvm_context); - llvm::Constant* protobuf_array_initializer = - llvm::ConstantDataArray::getString(*llvm_context, + llvm::ConstantDataArray::getString(module->getContext(), AsStringRef(protobuf_array_contents), /*AddNull=*/false); new llvm::GlobalVariable( *module, protobuf_array_initializer->getType(), /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, protobuf_array_initializer, AsStringRef(*protobuf_array_symbol_name)); - - return module; } static string CreateCPPShimExpression(StringPiece qualified_cpp_protobuf_name, @@ -88,7 +82,8 @@ static StatusOr CodegenModule(llvm::TargetMachine* target_machine, llvm::legacy::PassManager codegen_passes; if (target_machine->addPassesToEmitFile( - codegen_passes, ostream, llvm::TargetMachine::CGFT_ObjectFile)) { + codegen_passes, ostream, nullptr, + llvm::TargetMachine::CGFT_ObjectFile)) { return 
xla::InternalError( "Could not create pass pipeline to generate object file"); } @@ -115,42 +110,44 @@ GetTargetMachineFromTriple(StringPiece target_triple) { /*Features=*/"", llvm::TargetOptions(), llvm::None)); } -StatusOr CreateEmbeddedProtocolBuffer( - StringPiece target_triple, StringPiece symbol_prefix, - StringPiece qualified_cpp_protobuf_name, - const ::tensorflow::protobuf::MessageLite* proto) { +StatusOr CreateEmbeddedProtocolBuffers( + StringPiece target_triple, + gtl::ArraySlice protobufs_to_embed) { TF_ASSIGN_OR_RETURN(std::unique_ptr target_machine, GetTargetMachineFromTriple(target_triple)); llvm::LLVMContext llvm_context; - string object_file, cpp_shim, cpp_variable_decl; - - if (proto) { - string protobuf_array_symbol_name; - int64 protobuf_array_size; - - std::unique_ptr module_with_serialized_proto = - CreateModuleWithEmbeddedProtocolBuffer( - &llvm_context, target_machine.get(), *proto, symbol_prefix, - &protobuf_array_symbol_name, &protobuf_array_size); - TF_ASSIGN_OR_RETURN(object_file, - CodegenModule(target_machine.get(), - std::move(module_with_serialized_proto))); - cpp_shim = CreateCPPShimExpression(qualified_cpp_protobuf_name, - protobuf_array_symbol_name, - protobuf_array_size); - - cpp_variable_decl = strings::StrCat("extern \"C\" char ", - protobuf_array_symbol_name, "[];"); - } else { - TF_ASSIGN_OR_RETURN( - object_file, - CodegenModule(target_machine.get(), - MakeUnique("empty_module", llvm_context))); - cpp_shim = "nullptr"; + std::unique_ptr module_with_serialized_proto = + MakeUnique("embedded_data_module", llvm_context); + + EmbeddedProtocolBuffers result; + + for (const ProtobufToEmbed& protobuf_to_embed : protobufs_to_embed) { + string cpp_shim, cpp_variable_decl; + if (protobuf_to_embed.message) { + string protobuf_array_symbol_name; + int64 protobuf_array_size; + + AddEmbeddedProtocolBufferToLlvmModule( + module_with_serialized_proto.get(), *protobuf_to_embed.message, + protobuf_to_embed.symbol_prefix, &protobuf_array_symbol_name, + &protobuf_array_size); + cpp_shim = CreateCPPShimExpression( + protobuf_to_embed.qualified_cpp_protobuf_name, + protobuf_array_symbol_name, protobuf_array_size); + + cpp_variable_decl = strings::StrCat("extern \"C\" char ", + protobuf_array_symbol_name, "[];"); + } else { + cpp_shim = "nullptr"; + } + result.cpp_shims.push_back({cpp_shim, cpp_variable_decl}); } - return {{cpp_shim, cpp_variable_decl, object_file}}; + TF_ASSIGN_OR_RETURN(result.object_file_data, + CodegenModule(target_machine.get(), + std::move(module_with_serialized_proto))); + return result; } } // namespace tfcompile diff --git a/tensorflow/compiler/aot/embedded_protocol_buffers.h b/tensorflow/compiler/aot/embedded_protocol_buffers.h index 8436e0ff67f352..4e194a6aba9a9e 100644 --- a/tensorflow/compiler/aot/embedded_protocol_buffers.h +++ b/tensorflow/compiler/aot/embedded_protocol_buffers.h @@ -21,51 +21,70 @@ limitations under the License. #define TENSORFLOW_COMPILER_AOT_EMBEDDED_PROTOCOL_BUFFERS_H_ #include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/protobuf.h" namespace tensorflow { namespace tfcompile { using xla::StatusOr; -// Represents a protocol buffer embedded into an object file and describes a way -// to access it at runtime. -struct EmbeddedProtocolBuffer { - // cpp_shim_expression is a C++ expression that creates an instance of said - // protocol buffer when executed. 
- string cpp_shim_expression; - - // cpp_variable_decl is an "extern C" array declaration that is used in - // cpp_shim_expression. It must be visible wherever cpp_shim_expression is - // emitted. - string cpp_variable_decl; - - // The contents of the object (".o") file the protocol buffer is embbed in. - // This needs to be linked in to any program that wants to execute - // cpp_variable_decl . +// Represents a set of protocol buffers embedded into an object file and +// describes how to access them at runtime. +struct EmbeddedProtocolBuffers { + // Each instance CPPShim describes how to generate C++ code to instantiate a + // protobuf instance from the corresponding static data emitted into the + // object file. + struct CPPShim { + // `expression` is a C++ expression that creates an instance of said + // protocol buffer when executed. + string expression; + + // `variable_decl` is an "extern C" array declaration that is used in + // `expression`. It must be visible wherever `expression` is emitted. + string variable_decl; + }; + + // Each cpp_shim corresponds to one embedded protocol buffer. + std::vector cpp_shims; + + // The contents of the object (".o") file the protocol buffers are embbed in. + // This needs to be linked in to any program that wants to execute any of the + // expressions in `cpp_shims`. string object_file_data; }; -// Creates an object file that contains `proto`. -// -// `proto` is allowed to be nullptr, in which case the generated C++ shim -// expression is just `nullptr`, and the generated object file does not define -// any symbols. +// Describes a protocol buffer to embed into an object file. +struct ProtobufToEmbed { + // `symbol_prefix` is prefix that is guaranteed to be unique across the binary + // or DSO the generated object file will be linked into. + string symbol_prefix; + + // `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++ + // namespace qualified) protocol buffer name. This is only used in + // CPPShim::expression so relatively qualified names are fine as long as + // they're valid wherever CPPShim::expression is emitted. + string qualified_cpp_protobuf_name; + + // `message` is the protocol buffer to be embedded. It is allowed to be + // nullptr, in which case the generated C++ shim expression is just `nullptr`, + // and the generated object file does not define any symbols. + const ::tensorflow::protobuf::MessageLite* message; +}; + +// Embeds a sequence of protocol buffers into an object file. // // `target_triple` is the target triple for the target architecture for the // generated object file. // -// `symbol_prefix` is prefix that is guaranteed to be unique across the binary -// or DSO the generated object file will be linked into. -// -// `qualified_cpp_protobuf_name` is a qualified ("qualified" as in C++ -// namespace qualified) protocol buffer name. This needs is only used in -// EmbeddedProtocolBuffer::cpp_shim_expression so relatively qualified -// names are fine as long as they're valid wherever cpp_shim_expression -// is emitted. -StatusOr CreateEmbeddedProtocolBuffer( - StringPiece target_triple, StringPiece symbol_prefix, - StringPiece qualified_cpp_protobuf_name, - const ::tensorflow::protobuf::MessageLite* proto); +// `protobufs_to_embed` describes the protocol buffers to embed into the +// resulting object file. The C++ shim for protobufs_to_embed[i] is +// cpp_shims[i] in the returned EmbeddedProtocolBuffers instance. 
The contents +// of all the protocol buffers are embedded into a single .o file whose content +// is stored in the object_file_data field in the returned +// EmbeddedProtocolBuffers instance. +StatusOr CreateEmbeddedProtocolBuffers( + StringPiece target_triple, + gtl::ArraySlice protobufs_to_embed); } // namespace tfcompile } // namespace tensorflow diff --git a/tensorflow/compiler/aot/runtime.h b/tensorflow/compiler/aot/runtime.h index d085864f0012e4..d1a669ceb17b9f 100644 --- a/tensorflow/compiler/aot/runtime.h +++ b/tensorflow/compiler/aot/runtime.h @@ -25,8 +25,8 @@ namespace tensorflow { namespace tfcompile { namespace runtime { -// Align to 32-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. -static constexpr size_t kAlign = 32; +// Align to 64-bytes, to mimic tensorflow::Allocator::kAllocatorAlignment. +static constexpr size_t kAlign = 64; // aligned_buffer_bytes returns the sum of each size in `sizes`, skipping -1 // values. There are `n` entries in `sizes`. Each buffer is aligned to kAlign diff --git a/tensorflow/compiler/aot/runtime_test.cc b/tensorflow/compiler/aot/runtime_test.cc index 6d603a02eb4cea..06ec623eb2dce5 100644 --- a/tensorflow/compiler/aot/runtime_test.cc +++ b/tensorflow/compiler/aot/runtime_test.cc @@ -24,7 +24,7 @@ namespace runtime { namespace { TEST(Runtime, AlignmentValue) { - // We've chosen 32 byte alignment for the tfcompile runtime to mimic the + // We've chosen 64 byte alignment for the tfcompile runtime to mimic the // regular tensorflow allocator, which was chosen to play nicely with Eigen. // The tfcompile runtime also has a requirement that comes from the xla // generated code, on the relation: buffer_size >= 16 ? 2 * sizeof(void*) : 8 @@ -39,13 +39,13 @@ TEST(Runtime, AlignedBufferBytes) { EXPECT_EQ(aligned_buffer_bytes(sizesA, 1), 0); static constexpr intptr_t sizesB[1] = {3}; - EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesB, 1), 64); static constexpr intptr_t sizesC[1] = {32}; - EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 32); + EXPECT_EQ(aligned_buffer_bytes(sizesC, 1), 64); static constexpr intptr_t sizesD[7] = {1, -1, 32, -1, 64, 2, 3}; - EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 192); + EXPECT_EQ(aligned_buffer_bytes(sizesD, 7), 320); } void* add_ptr(void* base, uintptr_t delta) { @@ -101,11 +101,11 @@ TEST(Runtime, MallocFreeContiguousBuffers) { EXPECT_NE(base, nullptr); EXPECT_EQ(bufD[0], add_ptr(base, 0)); EXPECT_EQ(bufD[1], nullptr); - EXPECT_EQ(bufD[2], add_ptr(base, 32)); + EXPECT_EQ(bufD[2], add_ptr(base, 64)); EXPECT_EQ(bufD[3], nullptr); - EXPECT_EQ(bufD[4], add_ptr(base, 64)); - EXPECT_EQ(bufD[5], add_ptr(base, 128)); - EXPECT_EQ(bufD[6], add_ptr(base, 160)); + EXPECT_EQ(bufD[4], add_ptr(base, 128)); + EXPECT_EQ(bufD[5], add_ptr(base, 192)); + EXPECT_EQ(bufD[6], add_ptr(base, 256)); for (int i = 0; i < 7; ++i) { const intptr_t size = sizesD[i]; if (size != -1) { diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index bb73cb19c57a65..0ecc3feeb6fef1 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -7,6 +7,10 @@ package( load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") +# We disable some tfcompile tests in the open source build with the +# "manual" tag to avoid making our OSS users build LLVM twice +# (once for host and once for target). 
+ test_suite( name = "all_tests", tags = ["manual"], @@ -15,6 +19,7 @@ test_suite( ":test_graph_tfadd_with_ckpt_saver_test", ":test_graph_tfadd_with_ckpt_test", ":test_graph_tfassert_eq_test", + ":test_graph_tfcond_test", ":test_graph_tffunction_test", ":test_graph_tfgather_test", ":test_graph_tfmatmul_test", @@ -55,6 +60,7 @@ genrule( "test_graph_tfadd_with_ckpt_saver.pb", "test_graph_tfadd_with_ckpt_saver.saver", "test_graph_tfassert_eq.pb", + "test_graph_tfcond.pb", "test_graph_tffunction.pb", "test_graph_tfgather.pb", "test_graph_tfmatmul.pb", @@ -118,6 +124,17 @@ tf_library( ], ) +tf_library( + name = "test_graph_tfcond", + testonly = 1, + config = "test_graph_tfcond.config.pbtxt", + cpp_class = "CondComp", + graph = "test_graph_tfcond.pb", + tags = [ + "manual", + ], +) + tf_library( name = "test_graph_tffunction", testonly = 1, @@ -163,6 +180,15 @@ tf_library( tfcompile_flags = "--gen_name_to_index --gen_program_shape", ) +tf_library( + name = "test_graph_tfmatmulandadd_with_profiling", + testonly = 1, + config = "test_graph_tfmatmulandadd.config.pbtxt", + cpp_class = "MatMulAndAddCompWithProfiling", + enable_xla_hlo_profiling = True, + graph = "test_graph_tfmatmulandadd.pb", +) + tf_library( name = "test_graph_tfsplits", testonly = 1, @@ -185,13 +211,18 @@ tf_cc_test( ":test_graph_tfadd_with_ckpt", ":test_graph_tfadd_with_ckpt_saver", ":test_graph_tfassert_eq", + ":test_graph_tfcond", ":test_graph_tffunction", ":test_graph_tfgather", ":test_graph_tfmatmul", ":test_graph_tfmatmulandadd", + ":test_graph_tfmatmulandadd_with_profiling", ":test_graph_tfsplits", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_profile_printer", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//third_party/eigen3", diff --git a/tensorflow/compiler/aot/tests/make_test_graphs.py b/tensorflow/compiler/aot/tests/make_test_graphs.py index 67767f55dae9b1..9ec7df163b1425 100644 --- a/tensorflow/compiler/aot/tests/make_test_graphs.py +++ b/tensorflow/compiler/aot/tests/make_test_graphs.py @@ -78,6 +78,22 @@ def tfadd_with_ckpt_saver(out_dir): f.write(saver.as_saver_def().SerializeToString()) +def tfassert_eq(_): + x = array_ops.placeholder(dtypes.int32, name='x_hold') + y = array_ops.placeholder(dtypes.int32, name='y_hold') + control_flow_ops.Assert( + math_ops.equal(x, y), ['Expected x == y.'], name='assert_eq') + math_ops.add(x, math_ops.negative(y), name='x_y_diff') + + +def tfcond(_): + p = array_ops.placeholder(dtypes.bool, name='p_hold') + x = array_ops.placeholder(dtypes.int32, name='x_hold') + y = array_ops.placeholder(dtypes.int32, name='y_hold') + z = control_flow_ops.cond(p, lambda: x, lambda: y) + array_ops.identity(z, name='result') + + def tfgather(_): params = array_ops.placeholder(dtypes.float32, name='params') indices = array_ops.placeholder(dtypes.int32, name='indices') @@ -126,14 +142,6 @@ def tfsplits(_): array_ops.identity(y, name='result') -def tfassert_eq(_): - x = array_ops.placeholder(dtypes.int32, name='x_hold') - y = array_ops.placeholder(dtypes.int32, name='y_hold') - control_flow_ops.Assert( - math_ops.equal(x, y), ['Expected x == y.'], name='assert_eq') - math_ops.add(x, math_ops.negative(y), name='x_y_diff') - - def write_graph(build_graph, out_dir): """Build a graph using build_graph and write it out.""" g = ops.Graph() @@ -148,12 +156,13 @@ def main(_): write_graph(tfadd, FLAGS.out_dir) write_graph(tfadd_with_ckpt, FLAGS.out_dir) 
write_graph(tfadd_with_ckpt_saver, FLAGS.out_dir) + write_graph(tfassert_eq, FLAGS.out_dir) + write_graph(tfcond, FLAGS.out_dir) + write_graph(tffunction, FLAGS.out_dir) write_graph(tfgather, FLAGS.out_dir) write_graph(tfmatmul, FLAGS.out_dir) write_graph(tfmatmulandadd, FLAGS.out_dir) - write_graph(tffunction, FLAGS.out_dir) write_graph(tfsplits, FLAGS.out_dir) - write_graph(tfassert_eq, FLAGS.out_dir) if __name__ == '__main__': diff --git a/tensorflow/compiler/aot/tests/test_graph_tfcond.config.pbtxt b/tensorflow/compiler/aot/tests/test_graph_tfcond.config.pbtxt new file mode 100644 index 00000000000000..94a01ad4abfaab --- /dev/null +++ b/tensorflow/compiler/aot/tests/test_graph_tfcond.config.pbtxt @@ -0,0 +1,20 @@ +# Text form of tensorflow.tf2xla.Config proto. +feed { + id { node_name: "p_hold" } + shape {} +} +feed { + id { node_name: "x_hold" } + shape { + dim { size: 1 } + } +} +feed { + id { node_name: "y_hold" } + shape { + dim { size: 1 } + } +} +fetch { + id { node_name: "result" } +} diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc index 67dbd643bfc7bf..fee46280e9a0e7 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -21,19 +21,27 @@ limitations under the License. #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt.h" #include "tensorflow/compiler/aot/tests/test_graph_tfadd_with_ckpt_saver.h" #include "tensorflow/compiler/aot/tests/test_graph_tfassert_eq.h" +#include "tensorflow/compiler/aot/tests/test_graph_tfcond.h" #include "tensorflow/compiler/aot/tests/test_graph_tffunction.h" #include "tensorflow/compiler/aot/tests/test_graph_tfgather.h" #include "tensorflow/compiler/aot/tests/test_graph_tfmatmul.h" #include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd.h" +#include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling.h" #include "tensorflow/compiler/aot/tests/test_graph_tfsplits.h" +#include "tensorflow/compiler/xla/service/hlo_profile_printer.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" namespace tensorflow { namespace tfcompile { namespace { +using ::testing::HasSubstr; +using ::testing::IsSupersetOf; + TEST(TFCompileTest, Add) { AddComp add; EXPECT_EQ(add.arg0_data(), add.args()[0]); @@ -143,6 +151,31 @@ TEST(TFCompileTest, AddWithCkptSaver) { EXPECT_EQ(add_const.result0_data(), add_const.results()[0]); } +TEST(TFCompileTest, Cond) { + CondComp cond; + EXPECT_EQ(cond.arg0_data(), cond.args()[0]); + EXPECT_EQ(cond.arg1_data(), cond.args()[1]); + EXPECT_EQ(cond.arg2_data(), cond.args()[2]); + cond.arg1() = 10; + cond.arg2() = 20; + { + cond.arg0() = true; + const int32 expected_result = cond.arg1(); + EXPECT_TRUE(cond.Run()); + EXPECT_EQ(cond.result0(), expected_result); + EXPECT_EQ(cond.result0_data()[0], expected_result); + EXPECT_EQ(cond.result0_data(), cond.results()[0]); + } + { + cond.arg0() = false; + const int32 expected_result = cond.arg2(); + EXPECT_TRUE(cond.Run()); + EXPECT_EQ(cond.result0(), expected_result); + EXPECT_EQ(cond.result0_data()[0], expected_result); + EXPECT_EQ(cond.result0_data(), cond.results()[0]); + } +} + TEST(TFCompileTest, Gather) { GatherComp gather; EXPECT_EQ(gather.arg0_data(), gather.args()[0]); @@ -484,6 +517,56 @@ TEST(TFCompileTest, ProgramShape) { 
EXPECT_TRUE(ShapeUtil::Compatible(muladd_result1, f32_2x2)); } +TEST(TFCompileTest, HloProfiling) { + Eigen::ThreadPool tp(1); + Eigen::ThreadPoolDevice device(&tp, tp.NumThreads()); + + MatMulAndAddCompWithProfiling fn; + ASSERT_TRUE(fn.hlo_profiling_enabled()); + + fn.set_thread_pool(&device); + + // x = [[1, 2], [3, 4]] + fn.arg0(0, 0) = 1; + fn.arg0(0, 1) = 2; + fn.arg0(1, 0) = 3; + fn.arg0(1, 1) = 4; + + // y = [[10, 20], [30, 40]] + fn.arg1(0, 0) = 10; + fn.arg1(0, 1) = 20; + fn.arg1(1, 0) = 30; + fn.arg1(1, 1) = 40; + + EXPECT_TRUE(fn.Run()); + + string hlo_profile_as_string = + xla::PrintHloProfile(fn.hlo_profile_printer_data(), fn.profile_counters(), + /*clock_rate_ghz=*/1.0); + VLOG(1) << "HLO profile string:\n" << hlo_profile_as_string; + + std::vector hlo_profile_lines = + tensorflow::str_util::Split(hlo_profile_as_string, '\n'); + + auto header = HasSubstr("Execution profile for"); + auto total_cycles_profile_line = HasSubstr("[total]"); + auto dot_profile_line = HasSubstr( + "%dot.0.4 = f32[2,2]{1,0} dot(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} " + "%arg1.0.1)"); + auto add_profile_line = HasSubstr( + "%add.0.6 = f32[2,2]{1,0} add(f32[2,2]{1,0} %arg0.0.0, f32[2,2]{1,0} " + "%arg1.0.1)"); + auto tuple_profile_line = HasSubstr( + "%tuple.0.8 = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(f32[2,2]{1,0} " + "%dot.0.4, f32[2,2]{1,0} %add.0.6)"); + auto arg0_profile_line = HasSubstr("%arg0.0.0 = f32[2,2]{1,0} parameter(0)"); + auto arg1_profile_line = HasSubstr("%arg1.0.1 = f32[2,2]{1,0} parameter(1)"); + + EXPECT_THAT(hlo_profile_lines, + IsSupersetOf({header, total_cycles_profile_line, dot_profile_line, + add_profile_line, tuple_profile_line})); +} + } // namespace } // namespace tfcompile } // namespace tensorflow diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index 3a877c5337ff76..5c57fee326ca74 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -25,7 +25,8 @@ def tf_library(name, graph, config, visibility=None, testonly=None, tfcompile_flags=None, tfcompile_tool="//tensorflow/compiler/aot:tfcompile", - include_standard_runtime_deps=True, deps=None, tags=None): + include_standard_runtime_deps=True, + enable_xla_hlo_profiling=False, deps=None, tags=None): """Runs tfcompile to compile a TensorFlow graph into executable code. Given an invocation of tf_library(name="foo", ...), generates the following @@ -68,6 +69,8 @@ def tf_library(name, graph, config, include_standard_runtime_deps: If True, the standard list of kernel/runtime deps is added to deps. If False, deps must contain the full set of deps needed by the generated library. + enable_xla_hlo_profiling: Enable XLA HLO profiling in the generated program, + and emit metadata that lets us pretty-print the gathered profile counters. deps: a list of deps to include on the build rules for the generated library, added to the standard deps if standard_runtime_deps is True. tags: tags to apply to subsidiary build rules. 
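For reference, the hooks that `enable_xla_hlo_profiling` adds to the generated class are the ones exercised by the `HloProfiling` test above. A minimal sketch of consuming the profile outside a test, assuming a `tf_library` target built with `enable_xla_hlo_profiling = True` and `cpp_class = "MatMulAndAddCompWithProfiling"` as in the test target; the Eigen thread-pool setup mirrors what `tfcompile_test.cc` does and the Eigen include is an assumption about that test's headers:

```c++
// Sketch only, not part of the patch. Assumes the tf_library-generated header
// below exists in the build (enable_xla_hlo_profiling = True).
#define EIGEN_USE_THREADS
#include <string>

#include "tensorflow/compiler/aot/tests/test_graph_tfmatmulandadd_with_profiling.h"
#include "tensorflow/compiler/xla/service/hlo_profile_printer.h"
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"  // assumed, as in tfcompile_test.cc

std::string DumpHloProfile() {
  Eigen::ThreadPool tp(1);
  Eigen::ThreadPoolDevice device(&tp, tp.NumThreads());

  MatMulAndAddCompWithProfiling fn;
  if (!fn.hlo_profiling_enabled()) return "";  // profiling was not compiled in
  fn.set_thread_pool(&device);

  fn.Run();  // populates fn.profile_counters()
  return xla::PrintHloProfile(fn.hlo_profile_printer_data(),
                              fn.profile_counters(),
                              /*clock_rate_ghz=*/1.0);
}
```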
@@ -137,6 +140,10 @@ def tf_library(name, graph, config, flags = tfcompile_flags else: flags = " ".join(["'" + arg.replace("'", "'\\''") + "'" for arg in (tfcompile_flags or [])]) + if enable_xla_hlo_profiling: + profiling_flag = "--xla_hlo_profile" + else: + profiling_flag = "" native.genrule( name=("gen_" + name), srcs=[ @@ -157,7 +164,7 @@ def tf_library(name, graph, config, " --out_header=$(@D)/" + header_file + " --out_metadata_object=$(@D)/" + metadata_object_file + " --out_function_object=$(@D)/" + function_object_file + - " " + flags), + " " + flags + " " + profiling_flag), tools=[tfcompile_tool], visibility=visibility, testonly=testonly, @@ -220,6 +227,8 @@ def tf_library(name, graph, config, ] + (need_xla_data_proto and [ # If we're generating the program shape, we must depend on the proto. "//tensorflow/compiler/xla:xla_data_proto", + ] or []) + (enable_xla_hlo_profiling and [ + "//tensorflow/compiler/xla/service:hlo_profile_printer_data" ] or []) + (include_standard_runtime_deps and [ # TODO(cwhipkey): only depend on kernel code that the model actually needed. "//tensorflow/compiler/tf2xla/kernels:index_ops_kernel_argmax_float_1d", diff --git a/tensorflow/compiler/aot/tfcompile_main.cc b/tensorflow/compiler/aot/tfcompile_main.cc index 8ea014c2eede2c..839e1588b7be6c 100644 --- a/tensorflow/compiler/aot/tfcompile_main.cc +++ b/tensorflow/compiler/aot/tfcompile_main.cc @@ -100,6 +100,8 @@ Status Main(const MainFlags& flags) { if (flags.cpp_class.empty()) { return errors::InvalidArgument("Must specify --cpp_class"); } + codegen_opts.gen_hlo_profile_printer_data = + xla::legacy_flags::GetDebugOptionsFromFlags().xla_hlo_profile(); TF_RETURN_IF_ERROR(ParseCppClass(flags.cpp_class, &codegen_opts.class_name, &codegen_opts.namespaces)); diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index c8b4b05c6fb22c..0a10c97e74f320 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -23,6 +23,7 @@ package( load("//tensorflow:tensorflow.bzl", "cc_header_only_library") load("//tensorflow:tensorflow.bzl", "tf_kernel_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.bzl", "tf_gpu_cc_test") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda") load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") @@ -134,7 +135,6 @@ cc_library( srcs = ["xla_tensor.cc"], hdrs = ["xla_tensor.h"], deps = [ - ":common", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:shaped_buffer", @@ -186,11 +186,12 @@ cc_library( "//tensorflow/core/kernels:cast_op", "//tensorflow/core/kernels:constant_op", "//tensorflow/core/kernels:control_flow_ops", + "//tensorflow/core/kernels:identity_n_op", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:no_op", + "//tensorflow/core/kernels:resource_variable_ops", "//tensorflow/core/kernels:sendrecv_ops", "//tensorflow/core/kernels:variable_ops", - "@com_google_absl//absl/memory", ], ) @@ -227,6 +228,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:gpu_runtime", @@ -271,6 +273,7 @@ cc_library( name = "create_xla_launch_op", srcs = [ "create_xla_launch_op.cc", + 
"create_xla_launch_op.h", ], deps = [ ":common", @@ -280,6 +283,27 @@ cc_library( "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], + alwayslink = 1, +) + +tf_cc_test( + name = "create_xla_launch_op_test", + srcs = [ + "create_xla_launch_op.h", + "create_xla_launch_op_test.cc", + ], + deps = [ + ":create_xla_launch_op", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:session_options", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", ], ) @@ -299,6 +323,7 @@ cc_library( ":common", ":shape_inference_helpers", ":union_find", + ":xla_cluster_util", "//tensorflow/compiler/jit/graphcycles", "//tensorflow/compiler/jit/kernels:parallel_check_op", "//tensorflow/compiler/jit/legacy_flags:encapsulate_subgraphs_pass_flags", @@ -319,6 +344,18 @@ cc_library( ], ) +cc_library( + name = "xla_cluster_util", + srcs = ["xla_cluster_util.cc"], + hdrs = ["xla_cluster_util.h"], + deps = [ + "//tensorflow/compiler/jit/graphcycles", + "//tensorflow/core:framework", + "//tensorflow/core:graph", + "//tensorflow/core/kernels:bounds_check", + ], +) + cc_library( name = "union_find", hdrs = ["union_find.h"], @@ -370,6 +407,63 @@ tf_cc_test( ], ) +tf_cc_test( + name = "xla_launch_util_test", + size = "small", + srcs = ["xla_launch_util_test.cc"], + deps = [ + ":common", + ":xla_compilation_cache", + ":xla_launch_util", + ":xla_tensor", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:xla_compiler", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/core:core_cpu_internal", + "//tensorflow/core:framework", + "//tensorflow/core:gpu_runtime", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core/kernels:variable_ops", + ], +) + +cc_library( + name = "xla_fusion_optimizer", + srcs = ["xla_fusion_optimizer.cc"], + hdrs = ["xla_fusion_optimizer.h"], + visibility = ["//visibility:public"], + deps = [ + ":common", + ":union_find", + ":xla_cluster_util", + "//tensorflow/compiler/jit/graphcycles", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/grappler:grappler_item", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer", + "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry", + ], +) + +tf_gpu_cc_test( + name = "xla_fusion_optimizer_test", + srcs = ["xla_fusion_optimizer_test.cc"], + deps = [ + ":common", + ":xla_cluster_util", + ":xla_fusion_optimizer", + "//tensorflow/core:graph", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/grappler/utils:grappler_test", + ], +) + # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library. 
cc_header_only_library( name = "xla_jit_headers_lib", diff --git a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc index 9a2bb000752755..b17ff589e2597f 100644 --- a/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc +++ b/tensorflow/compiler/jit/build_xla_launch_ops_pass.cc @@ -40,7 +40,7 @@ static Status BuildLaunchNode( Graph* graph, Node** node) { NodeDef def; def.set_name(graph->NewName(nodename)); - def.set_op("_XlaLaunch"); + def.set_op("XlaLaunch"); def.set_device(device_name); AddNodeAttr("Tconstants", constant_dtypes, &def); AddNodeAttr("Targs", arg_dtypes, &def); @@ -79,7 +79,7 @@ static Status ReplaceNodeWithXlaLaunch(Graph* graph, Node* node) { node->input_types().begin() + num_constant_args, node->input_types().begin() + num_constant_args + num_nonconst_args); - // Build a _XlaLaunch operator to execute the function body. + // Build a XlaLaunch operator to execute the function body. Node* launch_node; TF_RETURN_IF_ERROR(BuildLaunchNode( graph->NewName(node->name()), node->type_string(), node->def().attr(), diff --git a/tensorflow/compiler/jit/create_xla_launch_op.cc b/tensorflow/compiler/jit/create_xla_launch_op.cc index 18d901323f1085..731b8ebfdc6262 100644 --- a/tensorflow/compiler/jit/create_xla_launch_op.cc +++ b/tensorflow/compiler/jit/create_xla_launch_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/jit/create_xla_launch_op.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/kernels/xla_launch_op.h" @@ -21,82 +22,194 @@ limitations under the License. #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/util/ptr_util.h" namespace tensorflow { namespace { -// Givens a NodeDef 'ndef' and the function library runtime 'flr', if -// 'ndef' is a call to a compilable function defined in 'flr', returns OK -// and fills in 'kernel' with a XlaLaunchOp kernel which computes the -// node. Otherwise, returns a non-OK. +// Utility which searches for values in a sorted list by scanning over it once. +// No matter how many times ScanForValue is called, the list is scanned at most +// once. However, if a call to ScanForValue skips over a value, that value is +// not revisited in future calls to ScanForValue, so callers must take +// care to order their calls. // -// This routine is here so that FunctionLibraryRuntime can jit a -// specific function call as requested. -Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& ndef, - std::unique_ptr* kernel) { - bool xla_compile = false; - if (!flr->GetFunctionLibraryDefinition() - ->GetAttr(ndef, kXlaCompileAttr, &xla_compile) - .ok() || - !xla_compile) { - // Not marked as _XlaCompile=true. - return errors::InvalidArgument("No ", kXlaCompileAttr, " for ", ndef.op()); +// Useful for merging multiple sorted lists in O(n) time. +class SinglePassSearch { + public: + // Creates a SinglePassSearch object that can be used to search in `values`. + // Does not take ownership of `values`. `values` must outlive this. + // `values` must be sorted. 
+ explicit SinglePassSearch(const std::vector* values) + : current_index_(0), values_(values) {} + + // Scans forward in the vector looking for "value", updating the internal + // position in to the vector. + // Returns true iff the vector contains the given value at or after current + // position. + // Not thread-safe. + bool ScanForValue(int value) { + while (current_index_ < values_->size() && + (*values_)[current_index_] <= value) { + if ((*values_)[current_index_] == value) { + current_index_++; + return true; + } + current_index_++; + } + return false; } - // Make sure that kernels have been registered on the JIT device. - XlaOpRegistry::RegisterCompilationKernels(); - if (!IsCompilable(flr, ndef)) { - // ndef is calling a function that XLA can't compile. - return errors::InvalidArgument("Not compilable: ", ndef.ShortDebugString()); + + private: + int current_index_; + const std::vector* values_; +}; + +Status CompilationRequested(const FunctionLibraryRuntime& flr, + const NodeDef& node_def) { + bool xla_compile = false; + // Check if op is marked _XlaCompile=true. + Status status = flr.GetFunctionLibraryDefinition()->GetAttr( + node_def, kXlaCompileAttr, &xla_compile); + if (!status.ok() || !xla_compile) { + if (VLOG_IS_ON(3)) { + if (!status.ok()) { + VLOG(3) << "No " << kXlaCompileAttr << " attr defined for " + << node_def.op() << ". status=" << status.ToString(); + } else { + VLOG(3) << node_def.op() << " is explicitly marked not to be compiled"; + } + } + return Status(error::INVALID_ARGUMENT, ""); } + return Status::OK(); +} + +// Given a FunctionLibraryRuntime and a NodeDef calling a function in the +// runtime, returns this function's body in `fbody` as well as the indices +// of its constant and resource arguments. +// `fbody` is owned by `flr`. +// `constant_arg_indices` and `resource_arg_indices` should be empty vector. +// They are sorted in ascending order on this function's return. +Status GetBodyAndConstantsAndResources(FunctionLibraryRuntime* flr, + const NodeDef& node_def, + const FunctionBody** fbody, + std::vector* constant_arg_indices, + std::vector* resource_arg_indices) { FunctionLibraryRuntime::Handle handle; - // If ndef is not instantiable, e.g., the function does not exist, + // If node_def is not instantiable, e.g., the function does not exist, // simply bail out. TF_RETURN_IF_ERROR( - flr->Instantiate(ndef.op(), AttrSlice(&ndef.attr()), &handle)); - const FunctionBody* fbody = flr->GetFunctionBody(handle); - CHECK(fbody); // Can't be nullptr since we just instantiated it. - std::vector const_args(fbody->arg_types.size()); + flr->Instantiate(node_def.op(), AttrSlice(&node_def.attr()), &handle)); + *fbody = flr->GetFunctionBody(handle); + CHECK(*fbody); // Can't be nullptr since we just instantiated it. + const DataTypeVector& arg_types = (*fbody)->arg_types; + std::vector const_args(arg_types.size()); // If we can't analyze the const args. Bail out. - TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*(fbody->graph), &const_args)); + TF_RETURN_IF_ERROR(BackwardsConstAnalysis(*((*fbody)->graph), &const_args)); for (int i = 0; i < const_args.size(); ++i) { if (const_args[i]) { - // There is a const arg. Bail out. - return errors::InvalidArgument("Const arg: ", i, " in ", - DebugString(fbody->fdef)); + constant_arg_indices->push_back(i); + } + } + + // There can be hundreds of resource variables. Reserve the space for them. + // We don't reserve for constants above as they are usually few. 
+ resource_arg_indices->reserve(arg_types.size()); + for (int i = 0; i < arg_types.size(); ++i) { + if (arg_types[i] == DT_RESOURCE) { + resource_arg_indices->push_back(i); } } - NodeDef launch_def; - launch_def.set_name(ndef.name()); - launch_def.set_op("_XlaLaunch"); - launch_def.set_device(flr->device()->name()); - AddNodeAttr("Tconstants", DataTypeVector{}, &launch_def); - AddNodeAttr("Nresources", 0, &launch_def); - AddNodeAttr("Targs", fbody->arg_types, &launch_def); - AddNodeAttr("Tresults", fbody->ret_types, &launch_def); - NameAttrList func; - func.set_name(ndef.op()); - *(func.mutable_attr()) = ndef.attr(); - AddNodeAttr("function", func, &launch_def); - - // TODO(b/32387911): Handles the host memory types across function - // calls properly. For now, we assume all inputs and outputs are on - // the device memory. + return Status::OK(); +} + +} // namespace + +Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def, + std::unique_ptr* kernel) { + TF_RETURN_IF_ERROR(CompilationRequested(*flr, node_def)); + + VLOG(3) << "Creating XlaLaunchOp for " << node_def.DebugString(); + + // Make sure that kernels have been registered on the JIT device. + XlaOpRegistry::RegisterCompilationKernels(); + if (!IsCompilable(flr, node_def)) { + // node_def is calling a function that XLA can't compile. + return errors::InvalidArgument("Not compilable: ", + node_def.ShortDebugString()); + } + + // Get function body, constant args, and resource args. + const FunctionBody* fbody = nullptr; + std::vector constant_arg_indices; + std::vector resource_arg_indices; + TF_RETURN_IF_ERROR(GetBodyAndConstantsAndResources( + flr, node_def, &fbody, &constant_arg_indices, &resource_arg_indices)); + + // Set input and output memory types. MemoryTypeVector input_memory_types(fbody->arg_types.size(), DEVICE_MEMORY); + // These indices are used only for optimization purposes. They allow us + // to loop over constant_arg_indices and resource_arg_indices only once + // while iterating over all the function arguments checking if it is a + // resource or a constant. + // The reason we optimized this code is because functions can have a lot of + // captured arguments. For example, the backward pass of ResNet50 takes in all + // 214 variables and a similar number of activations. + SinglePassSearch constants_search(&constant_arg_indices); + SinglePassSearch resources_search(&resource_arg_indices); + for (int i = 0; i < fbody->arg_types.size(); ++i) { + if (resources_search.ScanForValue(i) || constants_search.ScanForValue(i)) { + // Compile-time constants and resource handles are expected to be in + // host memory. + input_memory_types[i] = HOST_MEMORY; + } + } + // One might wonder, about the case where a compile-time constant argument + // (which must be in host memory) is also used as an input into an op, + // e.g. Add, that expects its inputs in device memory. Here is how it + // works now. + // First, what do we mean by "op expects an input in XYZ memory"? + // There are two types of "ops" here: the tf2xla kernel and the HLO + // computation it builds. The tf2xla kernel needs to retrieve the actual + // numeric value of the compile-time constant tensors, so it really expects + // them to be on in host memory. However, for other inputs, it refers to them + // using xla::ComputationDataHandle, which is just a symbolic handle that + // xla::ComputationBuilder assigns. How does this handle gets assigned for + // constant arguments? 
Even constant arguments get an _Arg node in the graph + // instatiated for Function compilation. The tf2xla kernel for constant _Arg + // nodes takes the constant value, converts it to XlaLiteral, and feeds it + // to xla::ComputationBuilder.ConstantLiteral, which returns the handle. This + // constant XlaLiteral is included in the HLO graph, and subsequently, in + // the actual executable, which is copied to the device before being + // executed. Thus, when this executable runs, the constant is available in + // device memory. + + // XlaLaunch kernel keeps all outputs (including constants, which it copies), + // in device memory MemoryTypeVector output_memory_types(fbody->ret_types.size(), DEVICE_MEMORY); + // Create the kernel. + NameAttrList function; + function.set_name(node_def.op()); + *(function.mutable_attr()) = node_def.attr(); + Device* dev = flr->device(); Status s; OpKernelConstruction construction( DeviceType(dev->device_type()), dev, - dev->GetAllocator(AllocatorAttributes()), &launch_def, + dev->GetAllocator(AllocatorAttributes()), &node_def, &fbody->fdef.signature(), flr, fbody->arg_types, input_memory_types, fbody->ret_types, output_memory_types, flr->graph_def_version(), &s); - kernel->reset(new XlaLocalLaunchOp(&construction)); + + *kernel = MakeUnique(&construction, constant_arg_indices, + resource_arg_indices, function); return s; } +namespace { + bool RegisterLaunchOpCreator() { RegisterDefaultCustomKernelCreator(CreateXlaLaunchOp); return true; diff --git a/tensorflow/compiler/jit/create_xla_launch_op.h b/tensorflow/compiler/jit/create_xla_launch_op.h new file mode 100644 index 00000000000000..98a22e351532c1 --- /dev/null +++ b/tensorflow/compiler/jit/create_xla_launch_op.h @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_ +#define TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_ + +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/lib/core/status.h" + +namespace tensorflow { + +class FunctionLibraryRuntime; +class OpKernel; + +// Given a NodeDef 'node_def' and the function library runtime 'flr', if +// 'node_def' is a call to a compilable function defined in 'flr', returns OK +// and fills in 'kernel' with a XlaLaunchOp kernel which computes the +// node. Otherwise, returns a non-OK. +Status CreateXlaLaunchOp(FunctionLibraryRuntime* flr, const NodeDef& node_def, + std::unique_ptr* kernel); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_CREATE_XLA_LAUNCH_OP_H_ diff --git a/tensorflow/compiler/jit/create_xla_launch_op_test.cc b/tensorflow/compiler/jit/create_xla_launch_op_test.cc new file mode 100644 index 00000000000000..b75ab486b80e09 --- /dev/null +++ b/tensorflow/compiler/jit/create_xla_launch_op_test.cc @@ -0,0 +1,145 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/create_xla_launch_op.h" + +#include "tensorflow/core/common_runtime/device_factory.h" +#include "tensorflow/core/common_runtime/function.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/node_def_builder.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/public/session_options.h" +#include "tensorflow/core/public/version.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace tensorflow { + +NodeDef ToNodeDef(const string& text) { + NodeDef node_def; + EXPECT_TRUE(protobuf::TextFormat::MergeFromString(text, &node_def)); + return node_def; +} + +// Create a FunctionDef that takes one resource and one regular param +FunctionDef XTimesY() { + return FunctionDefHelper::Define( + // Name + "XTimesY", + // Args + {"x: float", "y: resource"}, + // Return values + {"z: float"}, + // Attr def + {}, + // Nodes + { + {{"y0"}, "ReadVariableOp", {"y"}, {{"dtype", DT_FLOAT}}}, + {{"z"}, "Mul", {"x", "y0"}, {{"T", DT_FLOAT}}}, + }); +} + +class CreateXlaLaunchOpTest : public ::testing::Test { + protected: + void Init(const std::vector& flib) { + SessionOptions options; + auto* device_count = options.config.mutable_device_count(); + device_count->insert({"CPU", 1}); + TF_CHECK_OK(DeviceFactory::AddDevices( + options, "/job:localhost/replica:0/task:0", &devices_)); + + FunctionDefLibrary proto; + for (const auto& fdef : flib) { + *(proto.add_function()) = fdef; + } + lib_def_ = + MakeUnique(OpRegistry::Global(), proto); + OptimizerOptions opts; + device_mgr_ = MakeUnique(devices_); + pflr_ = MakeUnique( + device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(), + opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr); + flr_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0"); + } + + FunctionLibraryRuntime* flr_; + std::vector devices_; + std::unique_ptr device_mgr_; + std::unique_ptr lib_def_; + std::unique_ptr pflr_; + + std::unique_ptr kernel_; +}; + +AttrValue BoolAttr(bool b) { + AttrValue v; + v.set_b(b); + return v; +} + +TEST_F(CreateXlaLaunchOpTest, OneFloatOneResourceArgument) { + FunctionDef fdef = XTimesY(); + (*fdef.mutable_attr())["_XlaCompile"] = BoolAttr(true); + Init({fdef}); + + Status status = CreateXlaLaunchOp( + flr_, ToNodeDef(R"pb( + name: 'XTimesY' op: 'XTimesY' input: 'a' input: 'b' + )pb"), &kernel_); + ASSERT_TRUE(status.ok()) << status.ToString(); + + EXPECT_EQ("XTimesY", kernel_->name()); + EXPECT_EQ("XTimesY", kernel_->type_string()); + + EXPECT_EQ(2, kernel_->num_inputs()); + EXPECT_EQ(DT_FLOAT, kernel_->input_type(0)); + EXPECT_EQ(DT_RESOURCE, kernel_->input_type(1)); + EXPECT_EQ(DEVICE_MEMORY, kernel_->input_memory_types()[0]); + EXPECT_EQ(HOST_MEMORY, kernel_->input_memory_types()[1]); + + EXPECT_EQ(1, kernel_->num_outputs()); + 
EXPECT_EQ(DT_FLOAT, kernel_->output_type(0)); + EXPECT_EQ(DEVICE_MEMORY, kernel_->output_memory_types()[0]); +} + +TEST_F(CreateXlaLaunchOpTest, FailsIfXlaCompileAttrNotSet) { + FunctionDef fdef = XTimesY(); + Init({fdef}); + + Status status = CreateXlaLaunchOp(flr_, ToNodeDef(R"proto( + name: 'XTimesY' + op: 'XTimesY' + input: 'a' + input: 'b' + )proto"), &kernel_); + EXPECT_TRUE(errors::IsInvalidArgument(status)) << status.ToString(); +} + +TEST_F(CreateXlaLaunchOpTest, FailsIfXlaCompileAttrIsSetToFalse) { + FunctionDef fdef = XTimesY(); + (*fdef.mutable_attr())["_XlaCompile"] = BoolAttr(false); + Init({fdef}); + + Status status = CreateXlaLaunchOp(flr_, ToNodeDef(R"proto( + name: 'XTimesY' + op: 'XTimesY' + input: 'a' + input: 'b' + )proto"), &kernel_); + EXPECT_TRUE(errors::IsInvalidArgument(status)) << status.ToString(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index f06debaf316c01..6d1e3325ebd35b 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -240,7 +240,7 @@ class Encapsulator { // Once edges between compiled and outside_compilation clusters have been // replaced by send/recv ops, some dependencies may no longer be apparent. // A clustering pass finds all the dependencies between HC nodes that are only - // present as a result of edges between nodes in outside_compilaton clusters. + // present as a result of edges between nodes in outside_compilation clusters. // Suppose there is a path from outside_compilation cluster C in subgraph S // to outside_compilation cluster D in subgraph T. If S != T then a control // edge is added from the call node for S to the call node for T, which diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h index 34be4409a38119..5fee36f022a751 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.h @@ -80,7 +80,7 @@ Status EncapsulateSubgraphsInFunctions( std::unique_ptr* graph_out, FunctionLibraryDefinition* library); // The attribute that marks function calls produced by the encapsulate -// subgraphs pass and that should in turn be compiled via _XlaLaunch operators. +// subgraphs pass and that should in turn be compiled via XlaLaunch operators. extern const char* const kXlaCompiledKernelAttr; // Does `node` have the kXlaCompiledKernelAttr attribute? 
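The SinglePassSearch helper added in create_xla_launch_op.cc above exists so that the sorted constant and resource index lists can be merged with the full argument list in a single O(n) walk. A standalone sketch of that cursor walk, with made-up indices (the helper itself lives in an anonymous namespace, so the sketch re-implements the pattern inline rather than calling it):

```c++
// Sketch only, not part of the patch: classify each kernel input as
// HOST_MEMORY (compile-time constants and resource handles) or DEVICE_MEMORY
// by scanning two sorted index lists once. Indices are illustrative.
#include <cstdio>
#include <vector>

int main() {
  const int num_args = 7;
  const std::vector<int> constant_arg_indices = {0, 1};  // sorted
  const std::vector<int> resource_arg_indices = {5, 6};  // sorted

  size_t ci = 0, ri = 0;  // cursors advance monotonically: O(n) overall
  for (int i = 0; i < num_args; ++i) {
    bool host = false;
    if (ci < constant_arg_indices.size() && constant_arg_indices[ci] == i) {
      host = true;
      ++ci;
    } else if (ri < resource_arg_indices.size() &&
               resource_arg_indices[ri] == i) {
      host = true;
      ++ri;
    }
    std::printf("arg %d -> %s\n", i, host ? "HOST_MEMORY" : "DEVICE_MEMORY");
  }
  return 0;
}
```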
diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc index 5ec24d39a2c40a..eef113a3547f0b 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc @@ -1050,7 +1050,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { .WithAttr("_outside", "O1")); Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT}, shape2.opts()); - Node* h = Binary(ops::NodeOut(recv2, 0), e, + Node* h = Binary(ops::NodeOut(recv2, 1), e, shape2.opts() .WithName("H") .WithAttr("_encapsulate", "F1") @@ -1075,7 +1075,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { {"outside_compilation_O1_host_compute"}}, {{"outside_compilation_O2_host_compute"}, "XlaHostCompute", - {"D:o:0", "F:o:0"}, + {"F:o:0", "D:o:0"}, {{"Tinputs", gtl::ArraySlice({DT_FLOAT, DT_FLOAT})}, {"Toutputs", gtl::ArraySlice({DT_FLOAT})}, {"ancestors", @@ -1123,13 +1123,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) { Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2", {DT_FLOAT, DT_FLOAT}, b2.opts()); - Node* g = Binary(e, ops::NodeOut(recv2, 1), + Node* g = Binary(e, ops::NodeOut(recv2, 0), b2.opts() .WithName("G") .WithControlInputs({recv2, e}) .WithAttr("_encapsulate", "F1") .WithAttr("_outside", "O2")); - Node* h = Binary(ops::NodeOut(recv2, 0), e, + Node* h = Binary(ops::NodeOut(recv2, 1), e, b2.opts() .WithName("H") .WithAttr("_encapsulate", "F1") diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.cc b/tensorflow/compiler/jit/graphcycles/graphcycles.cc index bc68afb322b5cf..805bbc62c1e2e8 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.cc @@ -354,6 +354,16 @@ bool GraphCycles::IsReachableNonConst(int32 x, int32 y) { return reachable; } +bool GraphCycles::CanContractEdge(int32 a, int32 b) { + CHECK(HasEdge(a, b)) << "No edge exists from " << a << " to " << b; + RemoveEdge(a, b); + bool reachable = IsReachableNonConst(a, b); + // Restore the graph to its original state. + InsertEdge(a, b); + // If reachable, then contracting edge will cause cycle. + return !reachable; +} + bool GraphCycles::ContractEdge(int32 a, int32 b) { CHECK(HasEdge(a, b)); RemoveEdge(a, b); @@ -388,4 +398,8 @@ std::unordered_set GraphCycles::Successors(int32 node) { return rep_->nodes_[node]->out; } +std::unordered_set GraphCycles::Predecessors(int32 node) { + return rep_->nodes_[node]->in; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles.h b/tensorflow/compiler/jit/graphcycles/graphcycles.h index d11d6e27b1b7bb..44448fa3d787d0 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles.h +++ b/tensorflow/compiler/jit/graphcycles/graphcycles.h @@ -85,6 +85,9 @@ class GraphCycles { // and returns false. bool ContractEdge(int32 a, int32 b); + // Return true if can contract edge, otherwise return false. + bool CanContractEdge(int32 a, int32 b); + // Return whether dest_node is reachable from source_node // by following edges. 
bool IsReachable(int32 source_node, int32 dest_node) const; @@ -115,6 +118,7 @@ class GraphCycles { bool CheckInvariants() const; std::unordered_set Successors(int32 node); + std::unordered_set Predecessors(int32 node); // ---------------------------------------------------- struct Rep; diff --git a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc index e47b782207e912..274f5938a1228b 100644 --- a/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc +++ b/tensorflow/compiler/jit/graphcycles/graphcycles_test.cc @@ -494,6 +494,20 @@ TEST_F(GraphCyclesTest, ContractEdge) { EXPECT_TRUE(g_.HasEdge(1, 4)); } +TEST_F(GraphCyclesTest, CanContractEdge) { + ASSERT_TRUE(AddEdge(1, 2)); + ASSERT_TRUE(AddEdge(1, 3)); + ASSERT_TRUE(AddEdge(2, 3)); + ASSERT_TRUE(AddEdge(2, 4)); + ASSERT_TRUE(AddEdge(3, 4)); + + EXPECT_FALSE(g_.CanContractEdge(1, 3)); + EXPECT_FALSE(g_.CanContractEdge(2, 4)); + EXPECT_TRUE(g_.CanContractEdge(1, 2)); + EXPECT_TRUE(g_.CanContractEdge(2, 3)); + EXPECT_TRUE(g_.CanContractEdge(3, 4)); +} + static void BM_StressTest(int iters, int num_nodes) { while (iters > 0) { tensorflow::GraphCycles g; diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc index 23997a67166918..8617dd41a96e41 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc @@ -39,15 +39,15 @@ limitations under the License. namespace tensorflow { -XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) - : OpKernel(ctx), device_type_(ctx->device_type()) { - const NameAttrList* func; - OP_REQUIRES_OK(ctx, ctx->GetAttr("function", &func)); - function_ = *func; - DataTypeVector constant_types; - OP_REQUIRES_OK(ctx, ctx->GetAttr("Tconstants", &constant_types)); - num_constant_args_ = constant_types.size(); - OP_REQUIRES_OK(ctx, ctx->GetAttr("Nresources", &num_resource_args_)); +XlaLocalLaunchBase::XlaLocalLaunchBase(OpKernelConstruction* ctx, + const std::vector& constants, + const std::vector& resources, + const NameAttrList& function) + : OpKernel(ctx), + constants_(constants), + resources_(resources), + device_type_(ctx->device_type()), + function_(function) { if (device_type_ == DeviceType(DEVICE_CPU)) { platform_id_ = se::host::kHostPlatformId; } else if (device_type_ == DeviceType(DEVICE_GPU)) { @@ -59,8 +59,8 @@ XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) } } -Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx, - XlaCompilationCache** cache) { +Status XlaLocalLaunchBase::BuildCompilationCache(OpKernelContext* ctx, + XlaCompilationCache** cache) { const XlaDevice::Metadata* metadata; Status s = XlaDevice::GetMetadata(ctx, &metadata); if (s.ok()) { @@ -92,8 +92,8 @@ Status XlaLocalLaunchOp::BuildCompilationCache(OpKernelContext* ctx, return Status::OK(); } -void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { - VLOG(1) << "XlaLocalLaunchOp::Compute " +void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) { + VLOG(1) << "XlaLocalLaunchOpBase::Compute " << Canonicalize(function_.name(), AttrSlice(&function_.attr())); // We store information about the JIT-compiled XLA computation // in the ResourceMgr. @@ -114,7 +114,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { // this is more obviously correct.) 
core::ScopedUnref cache_ref(cache); - const XlaDevice::Metadata* metadata; + const XlaDevice::Metadata* metadata = nullptr; Status s = XlaDevice::GetMetadata(ctx, &metadata); bool allocate_xla_tensors = s.ok(); @@ -126,7 +126,7 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { } std::map variables = - SnapshotResourceVariables(ctx, num_resource_args_); + SnapshotResourceVariables(ctx, resources_); xla::LocalClient* client = static_cast(cache->client()); @@ -150,30 +150,32 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { XlaCompiler::Options options; options.client = client; - options.device_type = &cache->device_type(); + options.device_type = cache->device_type(); options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition(); options.graph_def_version = ctx->function_library()->graph_def_version(); options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId); options.device_allocator = xla_allocator; - // TODO(b/77671268): We don't set variable_representation_shape_fn here. This - // is restricted to Variables, but we need something like this to apply to - // normal Tensors too. + if (metadata) { + options.shape_representation_fn = metadata->shape_representation_fn(); + } const XlaCompiler::CompilationResult* kernel; xla::LocalExecutable* executable; std::map constant_args; - for (int i = 0; i < num_constant_args_; ++i) { + for (int i : constants_) { constant_args.insert({i, ctx->input(i)}); } - OP_REQUIRES_OK(ctx, cache->Compile(options, function_, constant_args, - variables, ctx, &kernel, &executable, - /*compile_options=*/nullptr)); + XlaCompiler::CompileOptions compile_options; + compile_options.is_entry_computation = true; + OP_REQUIRES_OK( + ctx, cache->Compile(options, function_, constant_args, variables, ctx, + &kernel, &executable, &compile_options)); VLOG(1) << "Executing XLA Computation..."; - XlaComputationLaunchContext launch_context( - num_resource_args_, client, xla_allocator, allocate_xla_tensors); + XlaComputationLaunchContext launch_context(client, xla_allocator, + allocate_xla_tensors); launch_context.PopulateInputs(ctx, kernel, variables); // Execute the computation. @@ -196,14 +198,69 @@ void XlaLocalLaunchOp::Compute(OpKernelContext* ctx) { VLOG(1) << "Done"; } +namespace { + +// OP_REQUIRES_OK_RETURN is the same as OP_REQUIRES_OK except that +// in error case, it returns RET instead of void. +#define OP_REQUIRES_OK_RETURN(CTX, RET, ...) \ + do { \ + ::tensorflow::Status _s(__VA_ARGS__); \ + if (!TF_PREDICT_TRUE(_s.ok())) { \ + (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ + return RET; \ + } \ + } while (0) + +// Helper static functions to construct parameters for +// XlaLocalLaunchBase constructor from OpKernelConstruction. 
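The `OP_REQUIRES_OK_RETURN` macro above exists so that the helpers defined next can bail out of a non-void function when an attribute lookup fails. Those helpers also rely on the `XlaLaunch` calling convention of "compile-time constants, then regular args, then resources": constant inputs occupy indices `[0, num_constants)` and resource inputs occupy the trailing indices. Here is a standalone sketch of that index arithmetic; the attribute counts are made up for illustration.

```cpp
#include <numeric>
#include <vector>

int main() {
  // Hypothetical op signature: 2 Tconstants, 3 Targs, 1 Nresources.
  const int num_constants = 2, num_args = 3, num_resources = 1;

  std::vector<int> constants(num_constants);
  std::iota(constants.begin(), constants.end(), 0);  // {0, 1}

  std::vector<int> resources(num_resources);
  std::iota(resources.begin(), resources.end(),
            num_constants + num_args);               // {5}
  return 0;
}
```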
+std::vector ConstantsVector(OpKernelConstruction* ctx) { + DataTypeVector constant_types; + OP_REQUIRES_OK_RETURN(ctx, std::vector(), + ctx->GetAttr("Tconstants", &constant_types)); + std::vector constants(constant_types.size()); + std::iota(constants.begin(), constants.end(), 0); + return constants; +} + +std::vector ResourcesVector(OpKernelConstruction* ctx) { + DataTypeVector constant_types; + OP_REQUIRES_OK_RETURN(ctx, std::vector(), + ctx->GetAttr("Tconstants", &constant_types)); + + DataTypeVector arg_types; + OP_REQUIRES_OK_RETURN(ctx, std::vector(), + ctx->GetAttr("Targs", &arg_types)); + + int num_resources; + OP_REQUIRES_OK_RETURN(ctx, std::vector(), + ctx->GetAttr("Nresources", &num_resources)); + + std::vector resources(num_resources); + std::iota(resources.begin(), resources.end(), + constant_types.size() + arg_types.size()); + return resources; +} + +NameAttrList FunctionAttr(OpKernelConstruction* ctx) { + const NameAttrList* func; + OP_REQUIRES_OK_RETURN(ctx, NameAttrList(), ctx->GetAttr("function", &func)); + return *func; +} + +#undef OP_REQUIRES_OK_RETURN +} // namespace + +XlaLocalLaunchOp::XlaLocalLaunchOp(OpKernelConstruction* ctx) + : XlaLocalLaunchBase(ctx, ConstantsVector(ctx), ResourcesVector(ctx), + FunctionAttr(ctx)) {} + XlaLocalLaunchOp::~XlaLocalLaunchOp() { VLOG(1) << "XlaLocalLaunchOp destroyed"; } -REGISTER_KERNEL_BUILDER(Name("_XlaLaunch").Device(DEVICE_CPU), - XlaLocalLaunchOp); +REGISTER_KERNEL_BUILDER(Name("XlaLaunch").Device(DEVICE_CPU), XlaLocalLaunchOp); -REGISTER_KERNEL_BUILDER(Name("_XlaLaunch") +REGISTER_KERNEL_BUILDER(Name("XlaLaunch") .Device(DEVICE_GPU) .HostMemory("constants") .HostMemory("resources"), diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.h b/tensorflow/compiler/jit/kernels/xla_launch_op.h index 8f8e646f0ff6d9..8dfc4b382d5115 100644 --- a/tensorflow/compiler/jit/kernels/xla_launch_op.h +++ b/tensorflow/compiler/jit/kernels/xla_launch_op.h @@ -26,6 +26,41 @@ limitations under the License. namespace tensorflow { +// XlaLocalLaunchBase is almost the same as XlaLocalLaunchOp. +// The only difference is that it does not require arguments to follow +// the "constants, then regular args, then resources" order. +// It takes vectors of constant and resource arguments explicitly. +// It does not have corresponding OpDef because it is never present +// in the GraphDef. +// Currently, it is used by eager runtime. FunctionLibraryRuntime creates +// this kernel when asked to create a kernel for an XLA-compiled function. +class XlaLocalLaunchBase : public OpKernel { + public: + XlaLocalLaunchBase(OpKernelConstruction* ctx, + const std::vector& constants, + const std::vector& resources, + const NameAttrList& function); + XlaLocalLaunchBase(const XlaLocalLaunchBase&) = delete; + XlaLocalLaunchBase& operator=(const XlaLocalLaunchBase&) = delete; + ~XlaLocalLaunchBase() override = default; + + void Compute(OpKernelContext* ctx) override; + + protected: + // Builds a XlaCompilationCache class suitable for the current device. + Status BuildCompilationCache(OpKernelContext* ctx, + XlaCompilationCache** cache); + + // Indexes of compile-time constant inputs + std::vector constants_; + // Indexes of resource inputs + std::vector resources_; + + DeviceType device_type_; + NameAttrList function_; + se::Platform::Id platform_id_; +}; + // XlaLocalLaunchOp is used to replace a region of the TensorFlow graph // which will be compiled and executed using XLA. 
The XlaLocalLaunchOp is // responsible for handling interactions with the TensorFlow executor. @@ -35,26 +70,12 @@ namespace tensorflow { // XlaLocalLaunchOp uses xla::LocalClient::Compile() and // xla::LocalExecutable::Run(), and passes arguments into/out of XLA in device // memory. -class XlaLocalLaunchOp : public OpKernel { +class XlaLocalLaunchOp : public XlaLocalLaunchBase { public: explicit XlaLocalLaunchOp(OpKernelConstruction* ctx); ~XlaLocalLaunchOp() override; - void Compute(OpKernelContext* ctx) override; - private: - // Builds a XlaCompilationCache class suitable for the current device. - Status BuildCompilationCache(OpKernelContext* ctx, - XlaCompilationCache** compiler); - - DeviceType device_type_; - NameAttrList function_; - int num_constant_args_; - // Number of resource variable arguments. - int num_resource_args_; - - se::Platform::Id platform_id_; - TF_DISALLOW_COPY_AND_ASSIGN(XlaLocalLaunchOp); }; diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 8e2ee0f1d71bc1..74468266b9e983 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -25,6 +25,7 @@ limitations under the License. #include "tensorflow/compiler/jit/graphcycles/graphcycles.h" #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h" #include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/function.h" @@ -41,11 +42,14 @@ limitations under the License. namespace tensorflow { -const char* const kXlaClusterAttr = "_XlaCluster"; -const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation"; - namespace { +// Returns true if, when executed in TensorFlow, `node` is guaranteed to forward +// a ref tensor input to its output. +static bool AlwaysForwardsRefInput(const Node& node) { + return node.IsIdentity(); +} + bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient // is really a kind of function call and will be handled by @@ -60,6 +64,26 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) { return false; } } + + // XLA does not offer guaranteed aliasing between the input and output of the + // XLA cluster so it can't implement the forward-tensor-ref semantic. Leave + // such nodes out of XLA clusters. + if (AlwaysForwardsRefInput(node)) { + for (const Edge* incoming_edge : node.in_edges()) { + if (incoming_edge->IsControlEdge()) { + continue; + } + + Node* incoming_node = incoming_edge->src(); + if (IsRefType(incoming_node->output_type(incoming_edge->src_output()))) { + VLOG(2) << "Not clustering " << node.def().ShortDebugString() + << " because of ref input " << incoming_node->name() << " " + << incoming_node->type_string(); + return false; + } + } + } + return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok(); } @@ -165,16 +189,6 @@ bool IsCompilableCall(const NodeDef& call_def, return true; } -// Returns the DeviceType corresponding to 'device'. 
-Status DeviceTypeOfDevice(const string& device, DeviceType* device_type) { - DeviceNameUtils::ParsedName parsed; - if (!DeviceNameUtils::ParseFullName(device, &parsed)) { - return errors::Internal("Malformed assigned device '", device, "'"); - } - *device_type = DeviceType(parsed.type); - return Status::OK(); -} - // Tests whether `node` has a DT_RESOURCE typed input or output. bool HasResourceInputOrOutput(const Node& node) { return std::find(node.input_types().begin(), node.input_types().end(), @@ -183,18 +197,11 @@ bool HasResourceInputOrOutput(const Node& node) { DT_RESOURCE) != node.output_types().end(); } -struct NodeCompare { - bool operator()(const Node* a, const Node* b) const { - return a->id() < b->id(); - } -}; -using OrderedNodeSet = std::set; - // Returns true if the op can be decomposed into XLA ops for which // there are fusable elemental implementations. // -// TODO(hpucha): Consider a black list instead of a white list as -// implemented below. +// TODO(hpucha): Remove this code since this functionality is subsumed by +// Grappler XlaFusionOptimizer. bool IsXlaFusable(const NodeDef& node) { static const std::unordered_set* elementwise_ops = new std::unordered_set( @@ -364,7 +371,7 @@ Status FindCompilationCandidates( for (Node* node : graph.op_nodes()) { sorted_nodes.push_back(node); } - std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeCompare()); + std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeComparatorID()); for (Node* node : sorted_nodes) { VLOG(2) << "Fuel: " << fuel; @@ -379,9 +386,13 @@ Status FindCompilationCandidates( DeviceType device_type(""); TF_RETURN_IF_ERROR( - DeviceTypeOfDevice(node->assigned_device_name(), &device_type)); + DeviceToDeviceType(node->assigned_device_name(), &device_type)); - if (is_compilable_fn && !is_compilable_fn(node, device_type)) continue; + if (is_compilable_fn && !is_compilable_fn(node, device_type)) { + VLOG(2) << "Compilation rejected node: not compilable " << node->name() + << ": " << node->type_string(); + continue; + } const XlaOpRegistry::DeviceRegistration* registration; CHECK( @@ -430,46 +441,6 @@ struct Cluster { int representative = -1; }; -// Returns a string describing how an edge from src to dst would -// create a cycle. 
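`DeviceTypeOfDevice` is not deleted outright; it moves to the new `xla_cluster_util` module under the name `DeviceToDeviceType` (see the new files further down), and the call sites in this pass are updated accordingly. A small usage sketch, assuming the new header; the helper name and device string are illustrative only.

```cpp
#include "tensorflow/compiler/jit/xla_cluster_util.h"
#include "tensorflow/core/framework/types.h"

tensorflow::Status DeviceTypeExample() {
  tensorflow::DeviceType device_type("");
  tensorflow::Status s = tensorflow::DeviceToDeviceType(
      "/job:localhost/replica:0/task:0/device:GPU:0", &device_type);
  if (s.ok()) {
    // device_type.type() is now "GPU"; a malformed name yields an
    // Internal error instead.
  }
  return s;
}
```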
-string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src, - int dst) { - int32 max_path_size = graph.num_node_ids() + 1; - std::vector path(max_path_size); - int32 path_size = cycles.FindPath(dst, src, max_path_size, path.data()); - if (path_size == 0) { - return ""; - } - - auto node_name = [&cycles, &graph](int node_id) { - if (!FastBoundsCheck(node_id, graph.num_node_ids())) { - return string("(null)"); - } - auto* node = graph.FindNodeId(node_id); - if (node == nullptr) { - return string("(null)"); - } - return node->name(); - }; - - string description; - strings::StrAppend(&description, "Edge from ", node_name(src), " to ", - node_name(dst), " would create a cycle.\n"); - path.resize(path_size); - for (int32 node_id : path) { - string ascii_art; - if (node_id == dst) { - ascii_art = "+-> "; - } else if (node_id != src) { - ascii_art = "| "; - } else { - ascii_art = "+-- "; - } - strings::StrAppend(&description, ascii_art, node_name(node_id), "\n"); - } - return description; -} - } // anonymous namespace bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) { @@ -575,84 +546,13 @@ Status MarkForCompilationPass::RunImpl( : Env::Default(), is_compilable_fn, &compilation_candidates)); - GraphCycles cycles; - for (int i = 0; i < graph->num_node_ids(); ++i) { - // We rely on the node IDs in the cycle detection graph being consecutive - // integers starting from 0. - CHECK_EQ(i, cycles.NewNode()); + if (compilation_candidates.empty()) { + VLOG(2) << "No compilable candidates"; + return Status::OK(); } - // Compute the loop structure of the graph. - std::vector control_flow_info; - TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &control_flow_info)); - - // The clustering code must avoid adding cycles to the graph to prevent - // deadlock. However, the graph may contain loops, which would trigger the - // cycle detection code. To handle loops, we alter the structure of the cycle - // detection graph, disconnecting each loop from the enclosing graph. - // Specifically, we: - // * add a new "frame" node for each loop. - // * replace edges to "Enter" nodes, and edges from "Exit" nodes with edges - // to/from the corresponding frame node. In essence, we collapse the loop - // into a single node for the purpose of cycle detection in the enclosing - // graph. - // * the body of the loop should now be disconnected from the rest of the - // graph; we make it acyclic by breaking loop backedges (edges outgoing from - // "NextIteration" nodes. - - // Map from frame name strings to node IDs in the cycle detection graph. - std::unordered_map frame_nodes; - - // Get the cycle graph node ID for frame 'frame_name', or add one if none - // exists. - auto GetOrAddFrameNodeId = [&frame_nodes, &cycles](const string& frame_name) { - int& frame_id = frame_nodes.emplace(frame_name, -1).first->second; - if (frame_id < 0) { - // The emplace succeeded; we have not allocated a frame node yet. - frame_id = cycles.NewNode(); - } - return frame_id; - }; - - for (Edge const* edge : graph->edges()) { - if (edge->dst()->IsEnter()) { - // Lift edges to an "Enter" node to the corresponding frame node. 
- const string& frame_name = - control_flow_info[edge->dst()->id()].frame_name; - int dst = GetOrAddFrameNodeId(frame_name); - if (!cycles.InsertEdge(edge->src()->id(), dst)) { - return errors::Internal( - "Cycle detected when adding enter->frame edge: ", - DescribeCycle(cycles, *graph, edge->src()->id(), dst)); - } - continue; - } - if (edge->src()->IsExit()) { - // Lift edges from an "Exit" node to the corresponding frame node. - const string& frame_name = - control_flow_info[edge->src()->id()].frame_name; - int src = GetOrAddFrameNodeId(frame_name); - if (!cycles.InsertEdge(src, edge->dst()->id())) { - return errors::Internal( - "Cycle detected when adding frame->exit edge: ", - DescribeCycle(cycles, *graph, src, edge->dst()->id())); - } - // Drop the original edge. - continue; - } - if (edge->src()->IsNextIteration()) { - // Break loop back-edges. - continue; - } - if (!cycles.InsertEdge(edge->src()->id(), edge->dst()->id())) { - // This should never happen. All cycles in the graph should contain - // a control flow operator. - return errors::Internal( - "Found cycle in graph without control flow operator during XLA " - "compilation: ", - DescribeCycle(cycles, *graph, edge->src()->id(), edge->dst()->id())); - } - } + GraphCycles cycles; + TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(graph, &cycles)); // Each compilation candidate belongs to a cluster. The cluster's // representative @@ -670,6 +570,9 @@ Status MarkForCompilationPass::RunImpl( // Repeatedly contract edges between clusters that are on the same device, // provided the contraction would not create a cycle. + // + // TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for + // example, from the Grappler fusion pass). while (!worklist.empty()) { int from = worklist.front()->Get().representative; worklist.pop_front(); @@ -778,7 +681,7 @@ Status MarkForCompilationPass::RunImpl( // compilation. 
DeviceType device_type(""); TF_RETURN_IF_ERROR( - DeviceTypeOfDevice(n->assigned_device_name(), &device_type)); + DeviceToDeviceType(n->assigned_device_name(), &device_type)); const XlaOpRegistry::DeviceRegistration* registration; XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration); diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index 703d8825d74ced..772c92d369e67f 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -633,5 +633,52 @@ TEST(XlaCompilationTest, ConstOp) { } } +TEST(XlaCompilationTest, DontClusterIdentityWithRefInput) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output variable = ops::Variable(root.WithOpName("variable"), + PartialTensorShape{}, DT_FLOAT); + Output read = ops::Identity(root.WithOpName("read"), variable); + Output neg = ops::Negate(root.WithOpName("negate"), read); + Output add = ops::Add(root.WithOpName("add"), neg, neg); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + TF_ASSERT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + + ASSERT_FALSE(clusters.empty()); + string cluster_name = clusters.begin()->second; + + std::unordered_map expected_clusters( + {{"negate", cluster_name}, {"add", cluster_name}}); + EXPECT_EQ(clusters, expected_clusters); +} + +TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) { + Scope root = Scope::NewRootScope().ExitOnError(); + Output variable = ops::Variable(root.WithOpName("variable"), + PartialTensorShape{}, DT_FLOAT); + Output read = ops::Identity(root.WithOpName("read"), variable); + Output neg = ops::Negate(root.WithOpName("negate"), read); + Output identity = ops::Negate(root.WithOpName("identity"), neg); + Output add = ops::Add(root.WithOpName("add"), identity, neg); + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + TF_ASSERT_OK(root.ToGraph(graph.get())); + TF_ASSERT_OK(MarkForCompilation(&graph)); + + std::unordered_map clusters = GetClusters(*graph); + + ASSERT_FALSE(clusters.empty()); + string cluster_name = clusters.begin()->second; + + std::unordered_map expected_clusters( + {{"negate", cluster_name}, + {"identity", cluster_name}, + {"add", cluster_name}}); + EXPECT_EQ(clusters, expected_clusters); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/ops/xla_ops.cc b/tensorflow/compiler/jit/ops/xla_ops.cc index 07320b43dab790..f2473d98ffd5da 100644 --- a/tensorflow/compiler/jit/ops/xla_ops.cc +++ b/tensorflow/compiler/jit/ops/xla_ops.cc @@ -17,7 +17,7 @@ limitations under the License. namespace tensorflow { -REGISTER_OP("_XlaLaunch") +REGISTER_OP("XlaLaunch") .Input("constants: Tconstants") .Attr("Tconstants: list(type) >= 0") .Input("args: Targs") @@ -28,7 +28,7 @@ REGISTER_OP("_XlaLaunch") .Attr("Tresults: list(type) >= 0") .Attr("function: func") // XLA random-number generation ops are stateful. - // TODO(phawkins): create stateful and non-stateful variants of _XlaLaunch. + // TODO(phawkins): create stateful and non-stateful variants of XlaLaunch. .SetIsStateful() .Doc("XLA Launch Op. 
For use by the XLA JIT only."); diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc new file mode 100644 index 00000000000000..70bd10336b824b --- /dev/null +++ b/tensorflow/compiler/jit/xla_cluster_util.cc @@ -0,0 +1,161 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_cluster_util.h" + +#include + +#include "tensorflow/core/graph/control_flow.h" +#include "tensorflow/core/kernels/bounds_check.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { + +const char* const kXlaClusterAttr = "_XlaCluster"; +const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation"; + +namespace { +// Returns a string describing how an edge from src to dst would +// create a cycle. +string DescribeCycle(const GraphCycles* cycles, const Graph& graph, int src, + int dst) { + int32 max_path_size = graph.num_node_ids() + 1; + std::vector path(max_path_size); + int32 path_size = cycles->FindPath(dst, src, max_path_size, path.data()); + if (path_size == 0) { + return ""; + } + + auto node_name = [cycles, &graph](int node_id) { + if (!FastBoundsCheck(node_id, graph.num_node_ids())) { + return string("(null)"); + } + auto* node = graph.FindNodeId(node_id); + if (node == nullptr) { + return string("(null)"); + } + return node->name(); + }; + + string description; + strings::StrAppend(&description, "Edge from ", node_name(src), " to ", + node_name(dst), " would create a cycle.\n"); + path.resize(path_size); + for (int32 node_id : path) { + string ascii_art; + if (node_id == dst) { + ascii_art = "+-> "; + } else if (node_id != src) { + ascii_art = "| "; + } else { + ascii_art = "+-- "; + } + strings::StrAppend(&description, ascii_art, node_name(node_id), "\n"); + } + return description; +} +} // namespace + +Status DeviceToDeviceType(const string& device, DeviceType* device_type) { + DeviceNameUtils::ParsedName parsed; + if (!DeviceNameUtils::ParseFullName(device, &parsed)) { + return errors::Internal("Malformed assigned device '", device, "'"); + } + *device_type = DeviceType(parsed.type); + return Status::OK(); +} + +Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles) { + for (int i = 0; i < graph->num_node_ids(); ++i) { + // We rely on the node IDs in the cycle detection graph being consecutive + // integers starting from 0. + CHECK_EQ(i, cycles->NewNode()); + } + + // Compute the loop structure of the graph. + std::vector control_flow_info; + TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &control_flow_info)); + + // The clustering code must avoid adding cycles to the graph to prevent + // deadlock. However, the graph may contain loops, which would trigger the + // cycle detection code. To handle loops, we alter the structure of the cycle + // detection graph, disconnecting each loop from the enclosing graph. 
+ // Specifically, we: + // * add a new "frame" node for each loop. + // * replace edges to "Enter" nodes, and edges from "Exit" nodes with edges + // to/from the corresponding frame node. In essence, we collapse the loop + // into a single node for the purpose of cycle detection in the enclosing + // graph. + // * the body of the loop should now be disconnected from the rest of the + // graph; we make it acyclic by breaking loop backedges (edges outgoing from + // "NextIteration" nodes. + + // Map from frame name strings to node IDs in the cycle detection graph. + std::unordered_map frame_nodes; + + // Get the cycle graph node ID for frame 'frame_name', or add one if none + // exists. + auto GetOrAddFrameNodeId = [&frame_nodes, cycles](const string& frame_name) { + int& frame_id = frame_nodes.emplace(frame_name, -1).first->second; + if (frame_id < 0) { + // The emplace succeeded; we have not allocated a frame node yet. + frame_id = cycles->NewNode(); + } + return frame_id; + }; + + for (Edge const* edge : graph->edges()) { + if (edge->dst()->IsEnter()) { + // Lift edges to an "Enter" node to the corresponding frame node. + const string& frame_name = + control_flow_info[edge->dst()->id()].frame_name; + int dst = GetOrAddFrameNodeId(frame_name); + if (!cycles->InsertEdge(edge->src()->id(), dst)) { + return errors::Internal( + "Cycle detected when adding enter->frame edge: ", + DescribeCycle(cycles, *graph, edge->src()->id(), dst)); + } + continue; + } + if (edge->src()->IsExit()) { + // Lift edges from an "Exit" node to the corresponding frame node. + const string& frame_name = + control_flow_info[edge->src()->id()].frame_name; + int src = GetOrAddFrameNodeId(frame_name); + if (!cycles->InsertEdge(src, edge->dst()->id())) { + return errors::Internal( + "Cycle detected when adding frame->exit edge: ", + DescribeCycle(cycles, *graph, src, edge->dst()->id())); + } + // Drop the original edge. + continue; + } + if (edge->src()->IsNextIteration()) { + // Break loop back-edges. + continue; + } + if (!cycles->InsertEdge(edge->src()->id(), edge->dst()->id())) { + // This should never happen. All cycles in the graph should contain + // a control flow operator. + return errors::Internal( + "Found cycle in graph without control flow operator during XLA " + "compilation: ", + DescribeCycle(cycles, *graph, edge->src()->id(), edge->dst()->id())); + } + } + return Status::OK(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h new file mode 100644 index 00000000000000..5b673bdc27fccb --- /dev/null +++ b/tensorflow/compiler/jit/xla_cluster_util.h @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Contains utilities for clustering compilable graph nodes via XLA. 
+ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ + +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" +#include "tensorflow/core/graph/algorithm.h" + +namespace tensorflow { + +// The attribute that marks nodes to be grouped into functions by the +// encapsulate subgraphs pass. +extern const char* const kXlaClusterAttr; + +// The attribute that marks nodes in a cluster to be placed outside the xla +// compilation by the encapsulate subgraphs pass. +extern const char* const kXlaOutsideCompilationAttr; + +using OrderedNodeSet = std::set; + +// Returns the DeviceType corresponding to 'device'. +Status DeviceToDeviceType(const string& device, DeviceType* device_type); + +// Creates a graph representation to enable cycle detection when clustering. +// This representation handles loops in graph by disconnecting each loop from +// the enclosing graph. +Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_compilation_cache.cc b/tensorflow/compiler/jit/xla_compilation_cache.cc index 6430975335f5ee..7ed609c4374806 100644 --- a/tensorflow/compiler/jit/xla_compilation_cache.cc +++ b/tensorflow/compiler/jit/xla_compilation_cache.cc @@ -122,8 +122,7 @@ Status XlaCompilationCache::BuildSignature( namespace { -// Builds a XlaCompiler::Argument vector from the arguments to the _XlaLaunch -// op. +// Builds a XlaCompiler::Argument vector from the arguments to the XlaLaunch op. Status BuildArguments(const std::map& constant_args, const std::map& variable_args, OpKernelContext* ctx, diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index 60458f6f3314b2..b1943d3e1a7e32 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -48,13 +48,12 @@ Status XlaCompileOnDemandOp::Run(OpKernelContext* ctx, const XlaCompiler::CompilationResult* result, xla::LocalExecutable* executable) { std::map variables = GetVariables(ctx); - int64 num_resource_args = variables.size(); xla::LocalClient* client = metadata.client(); // Builds an XLA allocator for the device. 
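The new `xla_cluster_util.h` above centralizes the cycle-detection plumbing that `mark_for_compilation_pass.cc` previously carried inline. The following sketch shows how a clustering pass might drive it together with the new `GraphCycles::CanContractEdge`; it is illustrative only and is not the actual `MarkForCompilationPass` logic.

```cpp
#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
#include "tensorflow/compiler/jit/xla_cluster_util.h"
#include "tensorflow/core/graph/graph.h"

namespace tensorflow {

// Illustrative sketch of a clustering pass built on the new utilities.
Status SketchClusteringPass(const Graph* graph) {
  GraphCycles cycles;
  Status status = CreateCycleDetectionGraph(graph, &cycles);
  if (!status.ok()) return status;

  // Node IDs in `cycles` are the same consecutive IDs used by `graph`,
  // with each loop collapsed into a synthetic frame node.
  for (const Edge* edge : graph->edges()) {
    const int src = edge->src()->id();
    const int dst = edge->dst()->id();
    if (cycles.HasEdge(src, dst) && cycles.CanContractEdge(src, dst)) {
      // src and dst could share a cluster without creating a cycle; a real
      // pass would also check device placement and then call ContractEdge.
    }
  }
  return Status::OK();
}

}  // namespace tensorflow
```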
XlaComputationLaunchContext launch_context( - num_resource_args, client, client->backend().memory_allocator(), true); + client, client->backend().memory_allocator(), true); launch_context.PopulateInputs(ctx, result, variables); @@ -152,16 +151,18 @@ Status XlaCompileOnDemandOp::Compile( core::ScopedUnref cache_ref(cache); XlaCompiler::Options options; - DeviceType device_type = metadata.jit_device_type(); - options.device_type = &device_type; + options.device_type = metadata.jit_device_type(); options.client = metadata.client(); options.flib_def = new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{}); + options.shape_representation_fn = metadata.shape_representation_fn(); + + XlaCompiler::CompileOptions compile_options; + compile_options.is_entry_computation = true; std::map variable_args = GetVariables(ctx); return cache->CompileSingleOp(options, constant_arguments, variable_args, ctx, - result, executable, - /*compile_options=*/nullptr); + result, executable, &compile_options); } void XlaCompileOnDemandOp::Compute(OpKernelContext* ctx) { diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.h b/tensorflow/compiler/jit/xla_compile_on_demand_op.h index 23c6f3903f841a..7cc3d0e007ba29 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.h +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.h @@ -29,11 +29,8 @@ limitations under the License. namespace tensorflow { // An OpKernel that compiles an op to an XLA computation and runs it. Unlike -// _XlaLaunch this doesn't rely on any rewrites of the graphdef - it will run a +// XlaLaunch this doesn't rely on any rewrites of the graphdef - it will run a // vanilla TensorFlow op as long as the bridge supports it. -// -// Importantly _XlaLaunch assumes all input and output tensors are on the host, -// whereas XlacompileOnDemandOp works with tensors in device memory. class XlaCompileOnDemandOp : public OpKernel { public: explicit XlaCompileOnDemandOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc index bc07dbd7bdf005..43648402f65c65 100644 --- a/tensorflow/compiler/jit/xla_cpu_device.cc +++ b/tensorflow/compiler/jit/xla_cpu_device.cc @@ -53,7 +53,9 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& options, TF_RETURN_IF_ERROR(XlaDevice::Create("Host", DEVICE_XLA_CPU, 0, DEVICE_CPU_XLA_JIT, options, name_prefix, registration, - /*transfer_as_literal=*/false, &device)); + /*transfer_as_literal=*/false, + /*shape_representation_fn=*/{}, + /*padded_shape_fn=*/{}, &device)); devices->push_back(device.release()); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index c814b7eb029054..ed007d603ea1b3 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -18,12 +18,12 @@ limitations under the License. #include #include -#include "absl/memory/memory.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h" #include "tensorflow/compiler/jit/xla_device_context.h" #include "tensorflow/compiler/jit/xla_device_ops.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/core/common_runtime/device.h" @@ -49,6 +49,7 @@ limitations under the License. 
#include "tensorflow/core/public/session_options.h" #include "tensorflow/core/public/version.h" #include "tensorflow/core/util/device_name_utils.h" +#include "tensorflow/core/util/ptr_util.h" #include "tensorflow/core/util/stream_executor_util.h" namespace tensorflow { @@ -105,12 +106,33 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator( return alloc_ptr; } +namespace { + +// Default PaddedShapeFn implementation that simply returns the unpadded +// on-device shape. This is accurate for CPU and GPU devices that neither +// transpose nor pad tensors. +Status DefaultPaddedShapeFn(const Tensor& tensor, xla::Shape* shape) { + const tensorflow::XlaTensor* xla_tensor = + tensorflow::XlaTensor::FromTensor(&tensor); + if (xla_tensor == nullptr) { + return TensorShapeToXLAShape(tensor.dtype(), tensor.shape(), shape); + } + + const xla::ShapedBuffer& shaped_buffer = xla_tensor->shaped_buffer(); + *shape = shaped_buffer.on_device_shape(); + return Status::OK(); +} + +} // namespace + /* static */ Status XlaDevice::Create( const string& platform_name, const string& device_name, int device_ordinal, const string& jit_device_name, const SessionOptions& options, const string& name_prefix, const XlaOpRegistry::DeviceRegistration& registration, - bool transfer_as_literal, std::unique_ptr* device) { + bool transfer_as_literal, + const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, + const PaddedShapeFn& padded_shape_fn, std::unique_ptr* device) { VLOG(1) << "XlaDevice::Create " << platform_name << " " << device_name << ":" << device_ordinal; @@ -129,17 +151,22 @@ XlaDeviceAllocator* XlaDeviceAllocatorState::GetOrCreateXlaDeviceAllocator( DeviceType(device_name), Bytes(16ULL << 30), DeviceLocality(), strings::StrCat("device: ", device_name, " device")); - device->reset(new XlaDevice(options, attrs, device_ordinal, - DeviceType(jit_device_name), - platform.ValueOrDie(), transfer_as_literal)); + device->reset(new XlaDevice( + options, attrs, device_ordinal, DeviceType(jit_device_name), + platform.ValueOrDie(), transfer_as_literal, shape_representation_fn, + padded_shape_fn ? 
padded_shape_fn : DefaultPaddedShapeFn)); return Status::OK(); } -XlaDevice::Metadata::Metadata(int device_ordinal, se::Platform* platform, - const DeviceType& device_type) +XlaDevice::Metadata::Metadata( + int device_ordinal, se::Platform* platform, const DeviceType& device_type, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + PaddedShapeFn padded_shape_fn) : device_ordinal_(device_ordinal), device_type_(device_type), - platform_(platform) {} + platform_(platform), + shape_representation_fn_(std::move(shape_representation_fn)), + padded_shape_fn_(std::move(padded_shape_fn)) {} int XlaDevice::Metadata::device_ordinal() const { return device_ordinal_; } @@ -170,17 +197,21 @@ const DeviceType& XlaDevice::Metadata::jit_device_type() const { return Status::OK(); } -XlaDevice::XlaDevice(const SessionOptions& options, - const DeviceAttributes& attrs, int device_ordinal, - const DeviceType& jit_device_name, se::Platform* platform, - bool transfer_as_literal) +XlaDevice::XlaDevice( + const SessionOptions& options, const DeviceAttributes& attrs, + int device_ordinal, const DeviceType& jit_device_name, + se::Platform* platform, bool transfer_as_literal, + const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, + const PaddedShapeFn& padded_shape_fn) : LocalDevice(options, attrs), - xla_metadata_(device_ordinal, platform, jit_device_name), + xla_metadata_(device_ordinal, platform, jit_device_name, + shape_representation_fn, padded_shape_fn), device_ordinal_(device_ordinal), jit_device_name_(jit_device_name), xla_allocator_(nullptr), platform_(platform), - transfer_as_literal_(transfer_as_literal) { + transfer_as_literal_(transfer_as_literal), + shape_representation_fn_(shape_representation_fn) { VLOG(1) << "Created XLA device " << jit_device_name; } @@ -230,10 +261,10 @@ Status XlaDevice::CreateAndSetGpuDeviceInfo() { GetAllocator({}); // XlaDevice owns both gpu_device_info_ and // gpu_device_info_->default_context. - gpu_device_info_ = absl::make_unique(); + gpu_device_info_ = MakeUnique(); gpu_device_info_->stream = stream; - gpu_device_info_->default_context = - new XlaDeviceContext(stream, client(), transfer_as_literal_); + gpu_device_info_->default_context = new XlaDeviceContext( + stream, client(), transfer_as_literal_, shape_representation_fn_); set_tensorflow_gpu_device_info(gpu_device_info_.get()); } @@ -247,7 +278,8 @@ Status XlaDevice::FillContextMap(const Graph* graph, TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); // Call GetAllocator for the side-effect of ensuring the allocator is created. GetAllocator({}); - auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_); + auto ctx = new XlaDeviceContext(stream, client(), transfer_as_literal_, + shape_representation_fn_); for (Node* n : graph->nodes()) { VLOG(2) << n->id() << " : " << n->type_string() << " : " << n->name(); ctx->Ref(); @@ -260,11 +292,10 @@ Status XlaDevice::FillContextMap(const Graph* graph, void XlaDevice::Compute(OpKernel* op_kernel, OpKernelContext* context) { VLOG(1) << "XlaDevice::Compute " << op_kernel->name() << ":" << op_kernel->type_string(); - // When TraceMe profiling is off (which is the default), the - // following TraceMe constructor is simply a conditional test of - // false value. Measurements show that its overhead is negligible. 
- port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string(), - op_kernel->IsExpensive()); + // When Xprof profiling is off (which is the default), constructing the + // activity is simple enough that its overhead is negligible. + tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), + op_kernel->IsExpensive()); op_kernel->Compute(context); } @@ -272,8 +303,8 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) { VLOG(1) << "XlaDevice::ComputeAsync " << op_kernel->name() << ":" << op_kernel->type_string(); - port::Tracing::TraceMe trace_me(op_kernel->name(), op_kernel->type_string(), - op_kernel->IsExpensive()); + tracing::ScopedActivity activity(op_kernel->name(), op_kernel->type_string(), + op_kernel->IsExpensive()); op_kernel->ComputeAsync(context, done); } @@ -295,7 +326,8 @@ Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto, Tensor copy(GetAllocator(alloc_attrs), parsed.dtype(), parsed.shape()); Notification n; TF_ASSIGN_OR_RETURN(se::Stream * stream, GetStream()); - XlaTransferManager manager(stream, client(), transfer_as_literal_); + XlaTransferManager manager(stream, client(), transfer_as_literal_, + shape_representation_fn_); manager.CopyCPUTensorToDevice(&parsed, this, ©, [&n, &status](const Status& s) { status = s; diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index 3ae87308cc7cff..02e88ee6793e98 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -17,8 +17,7 @@ limitations under the License. // runtime. // // Operators assigned to an XlaDevice are compiled into XLA computations. -// Tensors on an XlaDevice are thin wrappers around XLA GlobalDataHandles; state -// is managed by XLA. +// Tensors on an XlaDevice are thin wrappers around XLA ScopedShapedBuffers. // // XlaDevice is instantiated separately for each XLA backend (e.g., CPU or GPU), // under different names (e.g., XLA_CPU or XLA_GPU). @@ -27,6 +26,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_JIT_XLA_DEVICE_H_ #include "tensorflow/compiler/jit/xla_tensor.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/core/common_runtime/device_factory.h" @@ -45,12 +45,19 @@ namespace tensorflow { class XlaDevice : public LocalDevice { public: + // Given a tensor, sets `xla::Shape*` the shape of tensor's representation + // on device, fully padded. On error, the contents of `xla::Shape*` + // are undefined. + typedef std::function PaddedShapeFn; + // Wrapper class to store metadata about the XlaDevice, where it can be // retrieved e.g., when lazily creating the XlaCompilationCache device. class Metadata { public: Metadata(int device_ordinal, se::Platform* platform, - const DeviceType& device_type); + const DeviceType& device_type, + XlaCompiler::ShapeRepresentationFn shape_representation_fn, + PaddedShapeFn padded_shape_fn); // The index of the device on this host. 
int device_ordinal() const; @@ -58,11 +65,17 @@ class XlaDevice : public LocalDevice { se::Platform* platform() const; xla::LocalClient* client() const; const DeviceType& jit_device_type() const; + const XlaCompiler::ShapeRepresentationFn& shape_representation_fn() const { + return shape_representation_fn_; + } + const PaddedShapeFn& padded_shape_fn() const { return padded_shape_fn_; } private: const int device_ordinal_; const DeviceType device_type_; se::Platform* platform_; // Not owned. + XlaCompiler::ShapeRepresentationFn shape_representation_fn_; + PaddedShapeFn padded_shape_fn_; TF_DISALLOW_COPY_AND_ASSIGN(Metadata); }; @@ -76,16 +89,25 @@ class XlaDevice : public LocalDevice { // 'transfer_as_literal' is true if device<->host transfers must be done using // XLA's TransferLiteral{To,From}Device interface. If false, we can use // ThenMemcpy instead. - static Status Create(const string& platform_name, const string& device_name, - int device_ordinal, const string& jit_device_name, - const SessionOptions& options, const string& name_prefix, - const XlaOpRegistry::DeviceRegistration& registration, - bool transfer_as_literal, - std::unique_ptr* device); - + // If padded_shape_fn is empty, a default implementation that returns + // the on-host shape is used. + static Status Create( + const string& platform_name, const string& device_name, + int device_ordinal, const string& jit_device_name, + const SessionOptions& options, const string& name_prefix, + const XlaOpRegistry::DeviceRegistration& registration, + bool transfer_as_literal, + const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, + const PaddedShapeFn& padded_shape_fn, std::unique_ptr* device); + + // Creates a new XLA Device. + // If padded_shape_fn is empty, a default implementation that returns + // the logical on-device shape without padding is used. XlaDevice(const SessionOptions& options, const DeviceAttributes& attrs, int device_ordinal, const DeviceType& jit_device_name, - se::Platform* platform, bool transfer_as_literal); + se::Platform* platform, bool transfer_as_literal, + const XlaCompiler::ShapeRepresentationFn& shape_representation_fn, + const PaddedShapeFn& padded_shape_fn); ~XlaDevice() override; Allocator* GetAllocator(AllocatorAttributes attr) override; @@ -102,6 +124,7 @@ class XlaDevice : public LocalDevice { Tensor* tensor) override; xla::LocalClient* client() const; + const Metadata& metadata() { return xla_metadata_; } xla::StatusOr GetStream(); // If not already set, create and set GpuDeviceInfo. @@ -116,8 +139,8 @@ class XlaDevice : public LocalDevice { // The name of the device that is used to compile Ops for this XlaDevice. DeviceType jit_device_name_; // Memory allocator associated with this device. - Allocator* xla_allocator_; // Not owned. - se::Platform* platform_; // Not owned. + Allocator* xla_allocator_; // Not owned. + se::Platform* platform_; // Not owned. // Stream associated with this device. Operations enqueued on this // stream are executed on the device. Operations include data // copying back and forth between CPU and the device, and @@ -126,6 +149,7 @@ class XlaDevice : public LocalDevice { // Must we use XLA's transfer manager for correct host<->device transfers? if // false, we can use ThenMemcpy() instead. bool transfer_as_literal_; + XlaCompiler::ShapeRepresentationFn shape_representation_fn_; // If set, holds default device context (that we must Unref) // and its stream. 
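`XlaDevice` now threads two callbacks through to its `Metadata` and device contexts: `shape_representation_fn`, which maps a tensor's logical shape to the shape XLA should use on device, and `padded_shape_fn`, which reports the fully padded on-device shape of an existing tensor. Both are optional; the CPU factory above passes empty functions, so the defaults (identity shape, unpadded on-device shape) apply, and the `XlaTransferManager` changes below consult the representation function when allocating device buffers. A hedged sketch of what a backend-specific `shape_representation_fn` could look like follows; the rounding rule is invented purely for illustration and is not a real TensorFlow or XLA policy.

```cpp
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/types.h"

// Hypothetical example: a backend that wants the innermost dimension rounded
// up to a multiple of 8. Shows only the shape-in/shape-out contract of the
// callback; the policy itself is made up.
tensorflow::TensorShape ExampleShapeRepresentation(
    const tensorflow::TensorShape& shape, tensorflow::DataType dtype) {
  tensorflow::TensorShape result = shape;
  if (result.dims() > 0) {
    const tensorflow::int64 last = result.dim_size(result.dims() - 1);
    result.set_dim(result.dims() - 1, (last + 7) / 8 * 8);
  }
  return result;
}
```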
diff --git a/tensorflow/compiler/jit/xla_device_context.cc b/tensorflow/compiler/jit/xla_device_context.cc index bf8c1886a02231..71e63b110b3b13 100644 --- a/tensorflow/compiler/jit/xla_device_context.cc +++ b/tensorflow/compiler/jit/xla_device_context.cc @@ -47,22 +47,33 @@ void XlaDeviceAllocator::DeallocateRaw(void* ptr) { void XlaDeviceAllocator::GetStats(AllocatorStats* stats) { stats->Clear(); } -XlaTransferManager::XlaTransferManager(se::Stream* stream, - xla::LocalClient* client, - bool transfer_as_literal) +XlaTransferManager::XlaTransferManager( + se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn) : stream_(stream), client_(client), transfer_manager_(client->backend().transfer_manager()), - transfer_as_literal_(transfer_as_literal) {} + transfer_as_literal_(transfer_as_literal), + shape_representation_fn_(std::move(shape_representation_fn)) { + if (!shape_representation_fn_) { + shape_representation_fn_ = [](const TensorShape& shape, DataType dtype) { + return shape; + }; + } +} Status XlaTransferManager::TransferLiteralToDevice( const Tensor& host_tensor, Tensor* device_tensor) const { - xla::Literal literal; - TF_RETURN_IF_ERROR(HostTensorToLiteral(host_tensor, &literal)); - VLOG(1) << "Transfer to device as literal: " << literal.ToString(); + xla::Shape xla_shape; + TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(), + host_tensor.shape(), &xla_shape)); + xla::BorrowingLiteral literal( + static_cast(DMAHelper::base(&host_tensor)), xla_shape); const xla::ShapedBuffer& shaped_buffer = XlaTensor::FromTensor(device_tensor)->shaped_buffer(); + VLOG(1) << "Transfer to device as literal: " << literal.ToString() << " " + << shaped_buffer.ToString(); return transfer_manager_->TransferLiteralToDevice(stream_->parent(), literal, shaped_buffer); } @@ -75,8 +86,17 @@ Status XlaTransferManager::TransferLiteralFromDevice( TF_ASSIGN_OR_RETURN(std::unique_ptr literal, transfer_manager_->TransferLiteralFromDevice( stream_->parent(), shaped_buffer)); - VLOG(1) << "Transfer from device as literal: " << literal->ToString(); - return LiteralToHostTensor(*literal, host_tensor->dtype(), host_tensor); + VLOG(1) << "Transfer from device as literal: " << literal->ToString() << " " + << shaped_buffer.ToString(); + Tensor tensor; + TF_RETURN_IF_ERROR( + LiteralToHostTensor(*literal, host_tensor->dtype(), &tensor)); + // Reshape the tensor back to its declared shape. 
+ if (!host_tensor->CopyFrom(tensor, device_tensor.shape())) { + return errors::Internal( + "Tensor::CopyFrom failed when copying from XLA device to CPU"); + } + return Status::OK(); } void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor, @@ -89,16 +109,21 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor, << " " << reinterpret_cast( device_tensor->tensor_data().data()) - << " " << cpu_tensor->NumElements(); + << " " << cpu_tensor->NumElements() << " " + << cpu_tensor->shape().DebugString() << " " + << device_tensor->shape().DebugString(); void* src_ptr = const_cast(DMAHelper::base(cpu_tensor)); const int64 total_bytes = cpu_tensor->TotalBytes(); XlaTensor* xla_tensor = XlaTensor::FromTensor(device_tensor); CHECK(xla_tensor); + + TensorShape shape = shape_representation_fn_(device_tensor->shape(), + device_tensor->dtype()); if (!xla_tensor->has_shaped_buffer()) { Status s = xla_tensor->AllocateShapedBuffer( - device_tensor->dtype(), device_tensor->shape(), client_, + device_tensor->dtype(), shape, client_, stream_->parent()->device_ordinal()); if (!s.ok()) { done(s); @@ -106,12 +131,18 @@ void XlaTransferManager::CopyCPUTensorToDevice(const Tensor* cpu_tensor, } } - se::DeviceMemoryBase dev_dst_ptr = - XlaTensor::DeviceMemoryFromTensor(*device_tensor); Status status; if (transfer_as_literal_) { - status = TransferLiteralToDevice(*cpu_tensor, device_tensor); + Tensor reshaped_cpu_tensor; + if (!reshaped_cpu_tensor.CopyFrom(*cpu_tensor, shape)) { + done(errors::Internal( + "Tensor::CopyFrom failed when copying from CPU to XLA device")); + return; + } + status = TransferLiteralToDevice(reshaped_cpu_tensor, device_tensor); } else { + se::DeviceMemoryBase dev_dst_ptr = + XlaTensor::DeviceMemoryFromTensor(*device_tensor); stream_->ThenMemcpy(&dev_dst_ptr, src_ptr, total_bytes); // TODO(hpucha): Make this asynchronous. Status block_status = stream_->BlockHostUntilDone(); @@ -142,7 +173,9 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor, device_tensor->tensor_data().data()) << " " << reinterpret_cast(cpu_tensor->tensor_data().data()) - << device_tensor->NumElements(); + << " " << device_tensor->NumElements() << " " + << cpu_tensor->shape().DebugString() << " " + << device_tensor->shape().DebugString(); const int64 total_bytes = cpu_tensor->TotalBytes(); se::DeviceMemoryBase dev_src_ptr = @@ -171,9 +204,47 @@ void XlaTransferManager::CopyDeviceTensorToCPU(const Tensor* device_tensor, done(Status::OK()); } -XlaDeviceContext::XlaDeviceContext(se::Stream* stream, xla::LocalClient* client, - bool transfer_as_literal) - : manager_(stream, client, transfer_as_literal) {} +void XlaTransferManager::CopyDeviceTensorToDevice(const Tensor& src_tensor, + Tensor* dst_tensor, + const StatusCallback& done) { + // TODO(phawkins): replace this code with an asynchronous implementation. 
+ auto body = [&]() { + if (src_tensor.NumElements() == 0) { + return Status::OK(); + } + XlaTensor* xla_src = XlaTensor::FromTensor(&src_tensor); + XlaTensor* xla_dst = XlaTensor::FromTensor(dst_tensor); + CHECK(xla_src && xla_dst) + << "Missing destination tensor for device-to-device copy"; + if (!xla_dst->has_shaped_buffer()) { + TensorShape shape = + shape_representation_fn_(src_tensor.shape(), src_tensor.dtype()); + TF_RETURN_IF_ERROR( + xla_dst->AllocateShapedBuffer(src_tensor.dtype(), shape, client_, + stream_->parent()->device_ordinal())); + } + TF_RETURN_IF_ERROR( + xla_dst->shaped_buffer().buffers().ForEachMutableElementWithStatus( + [&](const xla::ShapeIndex& index, se::DeviceMemoryBase* buffer) { + const se::DeviceMemoryBase& from_buffer = + xla_src->shaped_buffer().buffers().element(index); + CHECK_EQ(buffer->size(), from_buffer.size()); + if (!stream_->parent()->SynchronousMemcpy(buffer, from_buffer, + buffer->size())) { + return errors::Internal("Device to device memcpy failed"); + } + return Status::OK(); + })); + return Status::OK(); + }; + done(body()); +} + +XlaDeviceContext::XlaDeviceContext( + se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn) + : manager_(stream, client, transfer_as_literal, + std::move(shape_representation_fn)) {} void XlaDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, @@ -190,4 +261,10 @@ void XlaDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, done); } +void XlaDeviceContext::CopyDeviceTensorToDevice(const Tensor& src_tensor, + Tensor* dst_tensor, + const StatusCallback& done) { + manager_.CopyDeviceTensorToDevice(src_tensor, dst_tensor, done); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_device_context.h b/tensorflow/compiler/jit/xla_device_context.h index d7f5f1d2089892..ee346e5653bbf9 100644 --- a/tensorflow/compiler/jit/xla_device_context.h +++ b/tensorflow/compiler/jit/xla_device_context.h @@ -19,6 +19,7 @@ limitations under the License. #include #include "tensorflow/compiler/jit/xla_tensor.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/core/framework/allocator.h" @@ -45,14 +46,19 @@ class XlaDeviceAllocator : public Allocator { // Helper class for managing data transfers between host and XLA devices. class XlaTransferManager { public: - explicit XlaTransferManager(se::Stream* stream, xla::LocalClient* client, - bool transfer_as_literal); + explicit XlaTransferManager( + se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, StatusCallback done) const; void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece tensor_name, Device* device, Tensor* cpu_tensor, StatusCallback done); + + void CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor, + const StatusCallback& done); + se::Stream* stream() const { return stream_; } private: @@ -69,7 +75,8 @@ class XlaTransferManager { // Transfer manager, for marshalling data to and from the device. xla::TransferManager* transfer_manager_; // True if we must use XLA's TransferManager for correct device transfers. 
- bool transfer_as_literal_; + const bool transfer_as_literal_; + XlaCompiler::ShapeRepresentationFn shape_representation_fn_; }; // DeviceContext for operators assigned to XlaDevice devices. The @@ -77,8 +84,9 @@ class XlaTransferManager { // wraps the methods in XlaTransferManager. class XlaDeviceContext : public DeviceContext { public: - explicit XlaDeviceContext(se::Stream* stream, xla::LocalClient* client, - bool transfer_as_literal); + explicit XlaDeviceContext( + se::Stream* stream, xla::LocalClient* client, bool transfer_as_literal, + XlaCompiler::ShapeRepresentationFn shape_representation_fn); void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, @@ -86,6 +94,9 @@ class XlaDeviceContext : public DeviceContext { void CopyDeviceTensorToCPU(const Tensor* device_tensor, StringPiece tensor_name, Device* device, Tensor* cpu_tensor, StatusCallback done) override; + void CopyDeviceTensorToDevice(const Tensor& src_tensor, Tensor* dst_tensor, + const StatusCallback& done); + se::Stream* stream() const override { return manager_.stream(); } private: diff --git a/tensorflow/compiler/jit/xla_device_ops.cc b/tensorflow/compiler/jit/xla_device_ops.cc index f68dba6b6a26c0..5ecb1afa7bcec9 100644 --- a/tensorflow/compiler/jit/xla_device_ops.cc +++ b/tensorflow/compiler/jit/xla_device_ops.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_device_ops.h" +#include + #include "tensorflow/compiler/jit/xla_device_context.h" +#include "tensorflow/compiler/jit/xla_tensor.h" namespace tensorflow { @@ -26,4 +29,82 @@ void XlaDeviceDummyOp::Compute(OpKernelContext* ctx) { << type_string() << " on an XLA device. This should never happen."; } +XlaAssignVariableOp::XlaAssignVariableOp(OpKernelConstruction* c) + : AsyncOpKernel(c) { + OP_REQUIRES_OK(c, c->GetAttr("dtype", &dtype_)); +} + +void XlaAssignVariableOp::ComputeAsync(OpKernelContext* context, + DoneCallback done) { + OP_REQUIRES_ASYNC(context, dtype_ == context->input(1).dtype(), + errors::InvalidArgument( + "Variable and value dtypes don't match; respectively, ", + dtype_, " and ", context->input(1).dtype()), + done); + Var* variable = nullptr; + OP_REQUIRES_OK_ASYNC( + context, + LookupOrCreateResource( + context, HandleFromInput(context, 0), &variable, + [this, context](Var** ptr) { + *ptr = new Var(dtype_); + PersistentTensor unused; + Tensor* tmp; + AllocatorAttributes attr; + TF_RETURN_IF_ERROR(context->allocate_persistent( + dtype_, context->input(1).shape(), &unused, &tmp, attr)); + *(*ptr)->tensor() = *tmp; + return Status::OK(); + }), + done); + core::ScopedUnref s(variable); + + OP_REQUIRES_ASYNC(context, variable->tensor()->dtype() == dtype_, + errors::InvalidArgument( + "Trying to assign variable with wrong dtype. Expected ", + DataTypeString(variable->tensor()->dtype()), " got ", + DataTypeString(dtype_)), + done); + + const Tensor& value = context->input(1); + AllocatorAttributes attr; + + // Copying is unnecessary if we are the last user of the value tensor, we can + // just adopt the input tensor's buffer instead. + std::unique_ptr input_alias = context->forward_input( + 1, /*output_index=*/OpKernelContext::Params::kNoReservation, dtype_, + value.shape(), DEVICE_MEMORY, attr); + mutex_lock ml(*variable->mu()); + variable->is_initialized = true; + if (input_alias) { + *variable->tensor() = *input_alias; + done(); + return; + } + + // Need to copy, but maybe we can re-use variable's buffer? 
+  if (!XlaTensor::RefCountIsOne(*variable->tensor()) ||
+      !variable->tensor()->shape().IsSameSize(value.shape())) {
+    // Copy to new buffer
+    PersistentTensor unused;
+    Tensor* tmp;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_persistent(dtype_, value.shape(),
+                                                      &unused, &tmp, attr),
+                         done);
+    *variable->tensor() = *tmp;
+  }
+
+  XlaDeviceContext* device_context =
+      static_cast<XlaDeviceContext*>(context->op_device_context());
+
+  variable->Ref();
+  device_context->CopyDeviceTensorToDevice(
+      value, variable->tensor(), [context, variable, done](Status status) {
+        variable->Unref();
+        context->SetStatus(status);
+        done();
+      });
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 498d25cf566a91..0c49286acd3aba 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -23,8 +23,10 @@ limitations under the License.
 #include "tensorflow/core/kernels/cast_op.h"
 #include "tensorflow/core/kernels/constant_op.h"
 #include "tensorflow/core/kernels/control_flow_ops.h"
+#include "tensorflow/core/kernels/identity_n_op.h"
 #include "tensorflow/core/kernels/identity_op.h"
 #include "tensorflow/core/kernels/no_op.h"
+#include "tensorflow/core/kernels/resource_variable_ops.h"
 #include "tensorflow/core/kernels/sendrecv_ops.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
@@ -32,7 +34,7 @@ namespace tensorflow {
 
 // Dummy OpKernel, used for kernels assigned to an XLA device that should be
 // compiled. Should never be called at runtime since such ops should be
-// rewritten to a _XlaLaunch op. If it is called, it means the placer placed an
+// rewritten to a XlaLaunch op. If it is called, it means the placer placed an
 // operator on an XLA device but the compiler did not compile it.
class XlaDeviceDummyOp : public OpKernel { public: @@ -40,8 +42,17 @@ class XlaDeviceDummyOp : public OpKernel { void Compute(OpKernelContext* ctx) override; }; +class XlaAssignVariableOp : public AsyncOpKernel { + public: + explicit XlaAssignVariableOp(OpKernelConstruction* c); + void ComputeAsync(OpKernelContext* context, DoneCallback done) override; + + private: + DataType dtype_; +}; + #define REGISTER_XLA_LAUNCH_KERNEL(DEVICE, KERNEL, TYPES) \ - REGISTER_KERNEL_BUILDER(Name("_XlaLaunch") \ + REGISTER_KERNEL_BUILDER(Name("XlaLaunch") \ .Device(DEVICE) \ .HostMemory("constants") \ .HostMemory("resources"), \ @@ -63,13 +74,37 @@ class XlaDeviceDummyOp : public OpKernel { ConstantOp); \ REGISTER_KERNEL_BUILDER( \ Name("Identity").Device(DEVICE).TypeConstraint("T", TYPES), IdentityOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("IdentityN").Device(DEVICE).TypeConstraint("T", TYPES), \ + IdentityNOp); \ REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE), PlaceholderOp); \ REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE), \ PlaceholderOp); \ \ REGISTER_KERNEL_BUILDER( \ Name("VarHandleOp").Device(DEVICE).HostMemory("resource"), \ - ResourceHandleOp); + ResourceHandleOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("ReadVariableOp").Device(DEVICE).HostMemory("resource"), \ + ReadVariableOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("AssignVariableOp").Device(DEVICE).HostMemory("resource"), \ + XlaAssignVariableOp); \ + REGISTER_KERNEL_BUILDER(Name("ControlTrigger").Device(DEVICE), \ + ControlTriggerOp); \ + REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"), \ + SwitchOp); \ + REGISTER_KERNEL_BUILDER( \ + Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp); \ + REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp); \ + REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp); \ + REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE), \ + NextIterationOp); \ + REGISTER_KERNEL_BUILDER(Name("LoopCond") \ + .Device(DEVICE) \ + .HostMemory("input") \ + .HostMemory("output"), \ + LoopCondOp); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc new file mode 100644 index 00000000000000..96016521ea9022 --- /dev/null +++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc @@ -0,0 +1,321 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_fusion_optimizer.h" + +#include +#include +#include +#include + +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/graphcycles/graphcycles.h" +#include "tensorflow/compiler/jit/union_find.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/core/common_runtime/shape_refiner.h" +#include "tensorflow/core/framework/node_def.pb.h" +#include "tensorflow/core/graph/graph_constructor.h" +#include "tensorflow/core/grappler/grappler_item.h" +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h" + +namespace tensorflow { + +// Is 'node' an operator that consumes only the shape of its input, not the +// data itself? +static bool IsShapeConsumerOp(const Node& node) { + return node.type_string() == "Shape" || node.type_string() == "ShapeN" || + node.type_string() == "Rank" || node.type_string() == "Size"; +} + +// Returns true if the op can be decomposed into XLA ops for which +// there are fusable elemental implementations. +bool IsXlaFusable(const NodeDef& node) { + static const std::unordered_set* elementwise_ops = + new std::unordered_set( + {// tf2xla/kernels/aggregate_ops.cc + "AddN", + // tf2xla/kernels/binary_ops.cc + "Add", "Sub", "Mul", "Div", "Atan2", "Complex", "FloorDiv", + "FloorMod", "BitwiseAnd", "BitwiseOr", "LeftShift", "RightShift", + "LogicalAnd", "LogicalOr", "Mod", "Maximum", "Minimum", "RealDiv", + "ReciprocalGrad", "RsqrtGrad", "SqrtGrad", "SquaredDifference", + "TruncateDiv", "TruncateMod", "Equal", "NotEqual", "Greater", + "GreaterEqual", "Less", "LessEqual", "SigmoidGrad", "SoftplusGrad", + "SoftsignGrad", "TanhGrad", "Pow", "ApproximateEqual", + // tf2xla/kernels/unary_ops.cc + "ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin", + "Asinh", "Atan", "Atanh", "Ceil", "Cos", "Cosh", "Sin", "Exp", + "Expm1", "Floor", "IsFinite", "IsInf", "IsNan", "Inv", "Reciprocal", + "Log", "Log1p", "Invert", "LogicalNot", "Neg", "Rint", "Round", + "Rsqrt", "Sigmoid", "Sign", "Sinh", "Softplus", "Softsign", "Sqrt", + "Square", "Tan", "Tanh", "Real", "Imag", + // tf2xla/kernels/bcast_ops.cc + "BroadcastArgs", "BroadcastGradientArgs", + // tf2xla/kernels/bias_ops.cc + "BiasAdd", "BiasAddV1", "BiasAddGrad" /*(Reduce)*/, + // tf2xla/kernels/cast_op.cc + "Cast", + // tf2xla/kernels/concat_op.cc + "Concat", "ConcatV2", "ConcatOffset", + // tf2xla/kernels/const_op.cc + "Const", + // tf2xla/kernels/elu_op.cc + "Elu", "EluGrad", "Selu", "SeluGrad", + // tf2xla/kernels/fill_op.cc + "Fill", + // tf2xla/kernels/identity_op.cc + "Identity", "IdentityN", "PreventGradient", + "StopGradient", /*"Snapshot",*/ + // tf2xla/kernels/index_ops.cc + "ArgMax", "ArgMin", + // tf2xla/kernels/mirror_pad_op.cc + "MirrorPad", + // tf2xla/kernels/one_hot_op.cc + "OneHot", + // tf2xla/kernels/pack_op.cc + "Pack", + // tf2xla/kernels/pad_op.cc + "Pad", "PadV2", + // tf2xla/kernels/relu_op.cc + "Relu", "Relu6", "ReluGrad", "Relu6Grad", + // tf2xla/kernels/reshape_op.cc + "Reshape", + // tf2xla/kernels/reverse_op.cc + "Reverse", "ReverseV2", + // tf2xla/kernels/reverse_sequence_op.cc + "ReverseSequence", + // tf2xla/kernels/shape_op.cc + "Shape", "ShapeN", "Rank", "Size", "ExpandDims", "Squeeze", + "ZerosLike", "OnesLike", + // tf2xla/kernels/slice_op.cc + "Slice", + // tf2xla/kernels/split_op.cc + "Split", "SplitV", + // tf2xla/kernels/strided_slice_op.cc + "StridedSlice", "StridedSliceGrad", 
"ResourceStridedSliceAssign", + // tf2xla/kernels/tile_ops.cc + "Tile", + // tf2xla/kernels/transpose_op.cc + "Transpose", "InvertPermutation", + // tf2xla/kernels/unpack_op.cc + "Unpack"}); + + return elementwise_ops->count(node.op()) > 0; +} + +Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* output) { + VLOG(2) << "Here at fusion optimizer"; + + // TODO(hpucha): Implement encapsulation and replacing with XlaLaunch op. + // Once that happens, the expected interaction between this optimizer and when + // the global_jit_level is set is as follows: Fusion optimizer will replace + // appropriate fusion clusters with XlaLaunch nodes. The remaining graph can + // be further compiled where possible via mark_for_compilation_pass. Note that + // this might lead to inefficient clustering, and it is best to use either the + // fusion optimizer or the global_jit flag, and not combine the two. + + // Create a Graph out of GraphDef. This is required currently because the + // helpers around clustering, encapsulation etc work on graphs. + FunctionLibraryDefinition function_library(OpRegistry::Global(), + item.graph.library()); + Graph graph(function_library); + ShapeRefiner shape_refiner(graph.versions(), graph.op_registry()); + shape_refiner.set_require_shape_inference_fns(false); + shape_refiner.set_disable_constant_propagation(true); + ImportGraphDefOptions options; + // Graph optimization happens at the late stage of graph execution, when + // colocation constraints are already validated previously and the device + // placement of nodes has also completed, so there is no need to validate + // colocation constraints again. + options.validate_colocation_constraints = false; + options.validate_shape = false; + TF_RETURN_IF_ERROR( + ImportGraphDef(options, item.graph, &graph, &shape_refiner)); + + // Collect nodes that can be fused via XLA, while ignoring those that + // explicitly ask for XLA: (*) nodes that are marked to be compiled + // explicitly. (*) nodes assigned to XLA device. + OrderedNodeSet compilation_candidates; + for (Node* node : graph.op_nodes()) { + // If there is a _XlaCompile annotation, ignore the node if it is + // true. Nodes are marked with this attr via experimental_jit_scope, and + // will be handled by the mark_for_compilation pass. + bool compile = false; + Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile); + if (status.ok() && compile) { + continue; + } + // If there is already a _XlaCluster annotation, ignore the node. Nodes are + // marked with this attr to indicate they are already part of a cluster and + // hence ignored. + status = GetNodeAttr(node->attrs(), kXlaClusterAttr, &compile); + if (status.ok()) { + continue; + } + + // If there is an explicit XLA device placement, ignore the node. + DeviceType device_type(""); + TF_RETURN_IF_ERROR(DeviceToDeviceType(node->def().device(), &device_type)); + if (device_type.type_string().find("XLA") != string::npos) continue; + + // Assume all fusable ops are registered. + // TODO(hpucha): Check for registration if possible. + if (!IsXlaFusable(node->def())) { + continue; + } + + compilation_candidates.insert(node); + } + + if (compilation_candidates.empty()) { + VLOG(2) << "No compilable candidates"; + *output = item.graph; + return Status::OK(); + } + + GraphCycles cycles; + TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(&graph, &cycles)); + + // TODO(hpucha): Make clustering more robust. 
There are two known issues that + // we need to mitigate: (a) Non-resource variables can cause deadlocks + // when clustering changes order of execution. See b/77263461 for a specific + // example. (b) Queue operations can also cause deadlocks. See b/77261498 for + // example. + + struct Cluster { + // Identifies the node that represents this cluster in the cycle detection + // graph. + int representative = -1; + }; + + // Each compilation candidate belongs to a cluster. The cluster's + // representative names the node in the 'cycles' graph that represents the + // cluster. + std::vector> clusters(graph.num_node_ids()); + std::deque*> worklist; + for (Node* node : compilation_candidates) { + Cluster& cluster = clusters[node->id()].Get(); + cluster.representative = node->id(); + worklist.push_back(&clusters[node->id()]); + } + + // Repeatedly contract edges between clusters that are on the same device, + // provided the contraction would not create a cycle. This is a simplified + // version of the clustering in mark_for_compilation_pass that also deals with + // nodes that are explicitly tagged to be compiled/clustered. + while (!worklist.empty()) { + int from = worklist.front()->Get().representative; + worklist.pop_front(); + + Node* node_from = graph.FindNodeId(from); + if (node_from->IsControlFlow()) { + // Control flow nodes aren't compilation candidates and should never + // appear. + return errors::Internal( + "Found control flow node in clustering worklist: ", + node_from->type_string()); + } + for (int to : cycles.Successors(from)) { + if (to >= graph.num_node_ids()) { + // Node is a "frame" node that is present only in the cycle detection + // graph. No clustering is possible. + continue; + } + Node* node_to = graph.FindNodeId(to); + if (compilation_candidates.find(node_to) == + compilation_candidates.cend()) { + continue; + } + + // Do not cluster across devices. + if (node_from->def().device() != node_to->def().device()) { + VLOG(2) << "Devices " << node_from->def().device() << " " + << node_to->def().device(); + VLOG(2) << "Device names " << node_from->assigned_device_name() << " " + << node_to->assigned_device_name(); + continue; + } + + // Ops that consume shapes cannot be the root of a cluster. This is an + // optimization. + if (clusters[from].Size() == 1 && IsShapeConsumerOp(*node_from)) { + continue; + } + + // If contracting the edge would create a cycle, bail out. + // However, just because we can't merge the clusters now does not mean + // we won't be able to merge them in the future. + // e.g., if we have edges 1->2, 2->3 and 1->3, we cannot contract edge + // 1->3. But if we first contract 1->2 then we can later contract 1->3. + if (!cycles.ContractEdge(from, to)) continue; + + // Merge the clusters. ContractEdge uses 'from' as the number of the + // merged node, so make sure 'from' is the chosen representative. + clusters[from].Merge(&clusters[to]); + + worklist.push_back(&clusters[from]); + break; + } + } + + // Count the number of non-trivial elements in each cluster. + std::vector effective_cluster_sizes(graph.num_node_ids()); + for (const Node* n : compilation_candidates) { + int cluster = clusters[n->id()].Get().representative; + // Identity nodes will be removed if the node gets marked for compilation. + // Therefore we don't want to count them towards the effective cluster size. 
+ if (n->def().op() != "Identity") { + effective_cluster_sizes[cluster]++; + } + } + + const int min_cluster_size = 2; + int num_clusters = 0; + for (auto size : effective_cluster_sizes) { + if (size >= min_cluster_size) { + VLOG(3) << "Cluster " << num_clusters << " " << size; + num_clusters++; + } + } + + // Names for each cluster. + std::unordered_map cluster_names; + // Sequence number generator to ensure clusters have unique names. + static std::atomic cluster_sequence_num; + + for (Node* n : compilation_candidates) { + int cluster = clusters[n->id()].Get().representative; + + // Compile if this is a cluster of >= min_cluster_size compilable operators. + if (effective_cluster_sizes[cluster] >= min_cluster_size) { + string& name = cluster_names[cluster]; + + if (name.empty()) { + name = strings::StrCat("cluster_", cluster_sequence_num++); + } + n->AddAttr(kXlaClusterAttr, name); + VLOG(3) << "Assigning node " << n->name() << " to cluster " << name; + } + } + + graph.ToGraphDef(output); + return Status::OK(); +} + +REGISTER_GRAPH_OPTIMIZER_AS(XlaFusionOptimizer, "xla-fusion"); + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.h b/tensorflow/compiler/jit/xla_fusion_optimizer.h new file mode 100644 index 00000000000000..3d2309e782d387 --- /dev/null +++ b/tensorflow/compiler/jit/xla_fusion_optimizer.h @@ -0,0 +1,49 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_ + +#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h" + +namespace tensorflow { + +// Optimizes graphs by fusing ops where possible, resulting in more efficient +// execution. +class XlaFusionOptimizer : public grappler::CustomGraphOptimizer { + public: + XlaFusionOptimizer() {} + ~XlaFusionOptimizer() override {} + + Status Init( + const RewriterConfig_CustomGraphOptimizer* config = nullptr) override { + return Status::OK(); + } + + string name() const override { return "xla-fusion"; }; + + Status Optimize(grappler::Cluster* cluster, + const grappler::GrapplerItem& item, + GraphDef* output) override; + + void Feedback(grappler::Cluster* cluster, const grappler::GrapplerItem& item, + const GraphDef& optimize_output, double result) override { + // Nothing to do for XlaFusionOptimizer. + } +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_ diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc new file mode 100644 index 00000000000000..5736760a878dc8 --- /dev/null +++ b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc @@ -0,0 +1,183 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/jit/xla_fusion_optimizer.h" +#include "tensorflow/compiler/jit/defs.h" +#include "tensorflow/compiler/jit/xla_cluster_util.h" +#include "tensorflow/core/graph/graph_def_builder.h" +#include "tensorflow/core/graph/graph_def_builder_util.h" +#include "tensorflow/core/grappler/utils/grappler_test.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +REGISTER_OP("UncompilableNullary").Output("o: float"); +REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float"); + +class XlaFusionOptimizerTest : public grappler::GrapplerTest { + protected: + std::unordered_map GetClusters(const GraphDef& graph) { + std::unordered_map ids; + for (const NodeDef& node : graph.node()) { + string cluster; + if (GetNodeAttr(AttrSlice(node), kXlaClusterAttr, &cluster).ok()) { + CHECK(!cluster.empty()); + ids[node.name()] = cluster; + } + } + return ids; + } +}; + +TEST_F(XlaFusionOptimizerTest, Chains) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = + ops::SourceOp("UncompilableNullary", builder.opts().WithName("A")); + Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B")); + Node* c = ops::UnaryOp("Relu", b, builder.opts().WithName("C")); + Node* d = + ops::UnaryOp("UncompilableUnary", c, builder.opts().WithName("D")); + Node* e = ops::UnaryOp("Relu", d, builder.opts().WithName("E")); + ops::UnaryOp("Relu", e, builder.opts().WithName("F")); + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_EQ(4, clusters.size()); + EXPECT_EQ(clusters["B"], clusters["C"]); + EXPECT_EQ(clusters["E"], clusters["F"]); + EXPECT_NE(clusters["B"], clusters["E"]); + EXPECT_TRUE(clusters.find("A") == clusters.cend()); + EXPECT_TRUE(clusters.find("D") == clusters.cend()); +} + +TEST_F(XlaFusionOptimizerTest, FusableOps) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp( + "Placeholder", + builder.opts().WithName("A").WithAttr("dtype", tensorflow::DT_FLOAT)); + Node* b = ops::SourceOp( + "Placeholder", + builder.opts().WithName("B").WithAttr("dtype", tensorflow::DT_FLOAT)); + + Node* c = ops::BinaryOp("Add", a, b, builder.opts().WithName("C")); + ops::BinaryOp("MatMul", a, c, builder.opts().WithName("D")); + ops::UnaryOp("Abs", c, builder.opts().WithName("E")); + + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_EQ(2, clusters.size()); + EXPECT_EQ(clusters["C"], clusters["E"]); + EXPECT_TRUE(clusters.find("D") == clusters.cend()); +} + +TEST_F(XlaFusionOptimizerTest, IgnoreExplicitXLAAttrs) { + GraphDef graph; + { + GraphDefBuilder 
builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp( + "Placeholder", + builder.opts().WithName("A").WithAttr("dtype", tensorflow::DT_FLOAT)); + Node* b = ops::SourceOp( + "Placeholder", + builder.opts().WithName("B").WithAttr("dtype", tensorflow::DT_FLOAT)); + + Node* c = ops::BinaryOp( + "Add", a, b, + builder.opts().WithName("C").WithDevice("/device:XLA_CPU")); + ops::BinaryOp("MatMul", a, c, builder.opts().WithName("D")); + Node* e = ops::UnaryOp("Abs", c, builder.opts().WithName("E")); + ops::UnaryOp("Cos", e, + builder.opts().WithName("F").WithAttr(kXlaCompileAttr, true)); + + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_TRUE(clusters.empty()); +} + +TEST_F(XlaFusionOptimizerTest, UncompilableCycles) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* b = + ops::UnaryOp("UncompilableUnary", a, builder.opts().WithName("B")); + ops::BinaryOp("Mul", a, b, builder.opts().WithName("C")); + + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_TRUE(clusters.empty()); +} + +TEST_F(XlaFusionOptimizerTest, CompilableCycles) { + GraphDef graph; + { + GraphDefBuilder builder(GraphDefBuilder::kFailImmediately); + Node* a = ops::SourceOp("Const", builder.opts() + .WithName("A") + .WithAttr("dtype", DT_FLOAT) + .WithAttr("value", Tensor())); + Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B")); + ops::BinaryOp("Mul", a, b, builder.opts().WithName("C")); + TF_ASSERT_OK(builder.ToGraphDef(&graph)); + } + grappler::GrapplerItem item; + item.graph = graph; + + XlaFusionOptimizer optimizer; + GraphDef output; + TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output)); + auto clusters = GetClusters(output); + EXPECT_EQ(3, clusters.size()); + EXPECT_EQ(clusters["A"], clusters["B"]); + EXPECT_EQ(clusters["A"], clusters["C"]); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc index ad4d44ce26224e..65aa5c35ede81f 100644 --- a/tensorflow/compiler/jit/xla_gpu_device.cc +++ b/tensorflow/compiler/jit/xla_gpu_device.cc @@ -50,7 +50,9 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& options, //XlaDevice::Create("CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options, XlaDevice::Create("CUDA", DEVICE_XLA_GPU, 0, DEVICE_GPU_XLA_JIT, options, name_prefix, registration, - /*transfer_as_literal=*/false, &device); + /*transfer_as_literal=*/false, + /*shape_representation_fn=*/{}, + /*padded_shape_fn=*/{}, &device); if (!status.ok()) { // Treat failures as non-fatal; there might not be a GPU in the machine. 
VLOG(1) << "Failed to create XLA_GPU device: " << status; diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc index 9e098c46f422b4..661187f4a873b0 100644 --- a/tensorflow/compiler/jit/xla_interpreter_device.cc +++ b/tensorflow/compiler/jit/xla_interpreter_device.cc @@ -51,7 +51,9 @@ Status XlaInterpreterDeviceFactory::CreateDevices( TF_RETURN_IF_ERROR(XlaDevice::Create("Interpreter", DEVICE_XLA_INTERPRETER, 0, DEVICE_INTERPRETER_XLA_JIT, options, name_prefix, registration, - /*transfer_as_literal=*/false, &device)); + /*transfer_as_literal=*/false, + /*shape_representation_fn=*/{}, + /*padded_shape_fn=*/{}, &device)); devices->push_back(device.release()); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 2a7f04271d4b7e..d0c7a936512570 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -38,14 +38,13 @@ using xla::ScopedShapedBuffer; using xla::ShapedBuffer; } // anonymous namespace -std::map SnapshotResourceVariables(OpKernelContext* ctx, - int num_variables) { +std::map SnapshotResourceVariables( + OpKernelContext* ctx, const std::vector& variables) { std::map snapshot; - int first_variable = ctx->num_inputs() - num_variables; - for (int i = 0; i < num_variables; ++i) { + for (int i : variables) { Var* variable = nullptr; - ResourceHandle handle = HandleFromInput(ctx, first_variable + i); - OptionalTensor& tensor = snapshot[first_variable + i]; + ResourceHandle handle = HandleFromInput(ctx, i); + OptionalTensor& tensor = snapshot[i]; if (LookupResource(ctx, handle, &variable).ok()) { tf_shared_lock lock(*variable->mu()); tensor.name = handle.name(); @@ -61,32 +60,35 @@ XlaAllocator::XlaAllocator(const se::Platform* platform, Allocator* wrapped) XlaAllocator::~XlaAllocator() {} -xla::StatusOr XlaAllocator::Allocate( +xla::StatusOr XlaAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { - void* data = wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size); + AllocationAttributes attrs; + attrs.no_retry_on_failure = !retry_on_failure; + void* data = + wrapped_->AllocateRaw(Allocator::kAllocatorAlignment, size, attrs); if (data == nullptr) { return errors::ResourceExhausted("Out of memory while trying to allocate ", size, " bytes."); - } else { - return se::DeviceMemoryBase(data, size); } + return xla::OwningDeviceMemory(se::DeviceMemoryBase(data, size), + device_ordinal, this); } -Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) { - wrapped_->DeallocateRaw(mem->opaque()); +Status XlaAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { + wrapped_->DeallocateRaw(mem.opaque()); return Status::OK(); } -namespace { +namespace internal { // Return the 'index''th subtree of the given ShapedBuffer as a // ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the // subtree, and sets the input's buffer pointers to nullptr for the subtree. 
ScopedShapedBuffer ExtractSubShapedBuffer( ShapedBuffer* shaped_buffer, int index, xla::DeviceMemoryAllocator* allocator) { - xla::Shape on_host_shape = xla::ShapeUtil::GetTupleElementShape( + const xla::Shape& on_host_shape = xla::ShapeUtil::GetTupleElementShape( shaped_buffer->on_host_shape(), index); - xla::Shape on_device_shape = xla::ShapeUtil::GetTupleElementShape( + const xla::Shape& on_device_shape = xla::ShapeUtil::GetTupleElementShape( shaped_buffer->on_device_shape(), index); ShapedBuffer sub_shaped_buffer(on_host_shape, on_device_shape, @@ -98,20 +100,23 @@ ScopedShapedBuffer ExtractSubShapedBuffer( sub_shape_tree.CopySubtreeFrom(shape_tree, /*source_base_index=*/{index}, /*target_base_index=*/{}); - for (auto& index_to_buffer : shape_tree) { - if (!index_to_buffer.first.empty() && index_to_buffer.first[0] == index) { - index_to_buffer.second = se::DeviceMemoryBase(nullptr, 0); - } - } + shape_tree.ForEachMutableElement( + [index](const xla::ShapeIndex& shape_index, + tensorflow::se::DeviceMemoryBase* data) { + // shape_index is empty for the root node. Ignore that. + if (!shape_index.empty() && shape_index[0] == index) { + *data = tensorflow::se::DeviceMemoryBase(nullptr, 0); + } + }); return ScopedShapedBuffer(std::move(sub_shaped_buffer), allocator); } -} // namespace +} // namespace internal +using internal::ExtractSubShapedBuffer; XlaComputationLaunchContext::XlaComputationLaunchContext( - int64 num_resource_args, xla::LocalClient* client, - xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors) - : num_resource_args_(num_resource_args), - client_(client), + xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator, + bool allocate_xla_tensors) + : client_(client), xla_allocator_(xla_allocator), allocate_xla_tensors_(allocate_xla_tensors) {} @@ -190,11 +195,6 @@ void XlaComputationLaunchContext::PopulateOutputs( OP_REQUIRES_OK( ctx, ctx->allocate_output(i, const_tensor.shape(), &output_tensor)); - if (XlaTensor* xla_tensor = XlaTensor::FromTensor(output_tensor)) { - OP_REQUIRES_OK(ctx, xla_tensor->AllocateShapedBuffer( - const_tensor.dtype(), const_tensor.shape(), - client_, stream->parent()->device_ordinal())); - } Device* device = dynamic_cast(ctx->device()); OP_REQUIRES(ctx, device != nullptr, @@ -236,7 +236,7 @@ void XlaComputationLaunchContext::PopulateOutputs( } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( ctx->expected_output_dtype(i), shape, buffer, allocator); - output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(xla::OwningDeviceMemory(), {output_num}); ctx->set_output(i, output_tensor); } ++output_num; @@ -286,7 +286,7 @@ void XlaComputationLaunchContext::PopulateOutputs( } else { Tensor output_tensor = XlaTensorBuffer::MakeTensor( write.type, write.shape, buffer, allocator); - output.set_buffer(se::DeviceMemoryBase(nullptr, 0), {output_num}); + output.set_buffer(xla::OwningDeviceMemory(), {output_num}); *variable->tensor() = output_tensor; } ++output_num; diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h index 8a6ff3b0c75120..4390701ccbd0bc 100644 --- a/tensorflow/compiler/jit/xla_launch_util.h +++ b/tensorflow/compiler/jit/xla_launch_util.h @@ -22,6 +22,8 @@ limitations under the License. 
#include "tensorflow/compiler/jit/xla_tensor.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/core/framework/allocation_description.pb.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/types.h" @@ -31,15 +33,17 @@ limitations under the License. namespace tensorflow { class XlaAllocator; -// Takes a snapshot of the values of resource variable arguments, which are -// the last `num_variables` arguments. We snapshot tensors that back +// Takes a snapshot of the values of resource variable arguments, whose +// indices are specified in `variables` argument. We snapshot tensors that back // resource variables since concurrent updates may modify the shape, and it is // important that the shapes used for compilation match the true shapes of the // buffers. // -// Returns a map of TensorFlow argument index to resource variable. -std::map SnapshotResourceVariables(OpKernelContext* ctx, - int num_variables); +// Returns a map of TensorFlow argument index to resource variable. If a +// resource variable is not initialized, the corresponding OptionalTensor +// will have its `present` field set to false. +std::map SnapshotResourceVariables( + OpKernelContext* ctx, const std::vector& variables); // Adapter class that wraps a Tensorflow allocator as an XLA allocator. // Assumes that the Tensorflow allocator permits asynchronous deallocation: @@ -48,9 +52,9 @@ class XlaAllocator : public xla::DeviceMemoryAllocator { public: XlaAllocator(const se::Platform* platform, Allocator* wrapped); ~XlaAllocator() override; - xla::StatusOr Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) override; - Status Deallocate(int device_ordinal, se::DeviceMemoryBase* mem) override; + xla::StatusOr Allocate( + int device_ordinal, uint64 size, bool retry_on_failure) override; + Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; // The Tensorflow BFC allocator used on GPU allows host-side deallocation // before GPU execution takes place. Tensorflow uses the ordering of the main @@ -72,7 +76,7 @@ class XlaComputationLaunchContext { // Create a new launch context. 'allocate_xla_tensors' is true if allocated // output tensors and variables are always XlaTensors. If false they are // assumed to be "normal" device pointers. - XlaComputationLaunchContext(int64 num_resource_args, xla::LocalClient* client, + XlaComputationLaunchContext(xla::LocalClient* client, xla::DeviceMemoryAllocator* xla_allocator, bool allocate_xla_tensors); @@ -92,7 +96,6 @@ class XlaComputationLaunchContext { const std::vector& arguments() const { return arg_ptrs_; } private: - int64 num_resource_args_; xla::LocalClient* client_; xla::DeviceMemoryAllocator* xla_allocator_; bool allocate_xla_tensors_; @@ -140,6 +143,17 @@ class XlaTensorBuffer : public TensorBuffer { Allocator* allocator_; }; +// Exposed in this header file for microbenchmarking purposes, but this is an +// internal implementation detail. +namespace internal { +// Return the 'index''th subtree of the given ShapedBuffer as a +// ScopedShapedBuffer. The returned ScopedShapedBuffer takes ownership of the +// subtree, and sets the input's buffer pointers to nullptr for the subtree. 
+xla::ScopedShapedBuffer ExtractSubShapedBuffer( + xla::ShapedBuffer* shaped_buffer, int index, + xla::DeviceMemoryAllocator* allocator); +} // namespace internal + } // namespace tensorflow #endif diff --git a/tensorflow/compiler/jit/xla_launch_util_test.cc b/tensorflow/compiler/jit/xla_launch_util_test.cc new file mode 100644 index 00000000000000..a45932403ec176 --- /dev/null +++ b/tensorflow/compiler/jit/xla_launch_util_test.cc @@ -0,0 +1,64 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Contains microbenchmarks for performance critical functions in +// xla_launch_util.cc. + +#include "tensorflow/compiler/jit/xla_launch_util.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/test_benchmark.h" + +// Test ExtractSubBuffer with different depths (depth of ShapeTree) and fan-outs +// (cardinality of each non-leaf node's children). +void BM_ExtractSubBuffer(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = xla::ShapeUtil::MakeTupleShape(shapes); + } + xla::ShapedBuffer shaped_buffer(shape, shape, /*platform=*/nullptr, + /*device_ordinal=*/0); + tensorflow::testing::StartTiming(); + for (int i = 0; i < iters; ++i) { + // Extract a buffer from approximately the middle of the first level of the + // tree. + (void)tensorflow::internal::ExtractSubShapedBuffer(&shaped_buffer, + /*index=*/fan_out / 2, + /*allocator=*/nullptr) + .release(); + } +} + +BENCHMARK(BM_ExtractSubBuffer) + ->ArgPair(1, 4) + ->ArgPair(1, 8) + ->ArgPair(1, 32) + ->ArgPair(1, 64) + ->ArgPair(1, 128) + ->ArgPair(1, 256) + ->ArgPair(1, 512) + ->ArgPair(2, 4) + ->ArgPair(2, 8) + ->ArgPair(2, 32) + ->ArgPair(2, 64) + ->ArgPair(2, 128); + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + tensorflow::testing::RunBenchmarks(); + return RUN_ALL_TESTS(); +} diff --git a/tensorflow/compiler/jit/xla_tensor.cc b/tensorflow/compiler/jit/xla_tensor.cc index ce6456880bc1b3..3c44c4ae6df7f3 100644 --- a/tensorflow/compiler/jit/xla_tensor.cc +++ b/tensorflow/compiler/jit/xla_tensor.cc @@ -18,7 +18,7 @@ limitations under the License. 
namespace tensorflow { -/*static*/ XlaTensor* XlaTensor::FromTensor(Tensor* tensor) { +/*static*/ XlaTensor* XlaTensor::FromTensor(const Tensor* tensor) { if (tensor->NumElements() == 0) { return nullptr; } @@ -27,8 +27,8 @@ namespace tensorflow { return xla_tensor; } -/*static*/ const XlaTensor* XlaTensor::FromTensor(const Tensor* tensor) { - return FromTensor(const_cast(tensor)); +/*static*/ bool XlaTensor::RefCountIsOne(const Tensor& tensor) { + return tensor.RefCountIsOne(); } /*static*/ se::DeviceMemoryBase XlaTensor::DeviceMemoryFromTensor( @@ -52,20 +52,24 @@ Status XlaTensor::AllocateShapedBuffer(DataType dtype, const TensorShape& shape, client->backend().transfer_manager()->HostShapeToDeviceShape( on_host_shape); - xla::ShapedBuffer buffer(on_host_shape, on_device_shape, client->platform(), - device_ordinal); - for (auto& index_to_buffer : buffer.buffers()) { + xla::ScopedShapedBuffer shaped_buffer(on_host_shape, on_device_shape, + client->backend().memory_allocator(), + device_ordinal); + for (auto& index_to_buffer : shaped_buffer.buffers()) { xla::Shape subshape = xla::ShapeUtil::GetSubshape(on_device_shape, index_to_buffer.first); uint64 size = client->backend().transfer_manager()->GetByteSizeRequirement(subshape); - TF_ASSIGN_OR_RETURN(index_to_buffer.second, + TF_ASSIGN_OR_RETURN(xla::OwningDeviceMemory buffer, client->backend().memory_allocator()->Allocate( device_ordinal, size, /*retry_on_failure=*/false)); + // Move our buffer into shaped_buffer, which takes ownership of it. + index_to_buffer.second = buffer.Forget(); } - set_shaped_buffer(xla::ScopedShapedBuffer( - std::move(buffer), client->backend().memory_allocator())); + VLOG(4) << shaped_buffer.ToString(); + + set_shaped_buffer(std::move(shaped_buffer)); return Status::OK(); } diff --git a/tensorflow/compiler/jit/xla_tensor.h b/tensorflow/compiler/jit/xla_tensor.h index 922a9189731209..c54001a999998f 100644 --- a/tensorflow/compiler/jit/xla_tensor.h +++ b/tensorflow/compiler/jit/xla_tensor.h @@ -34,10 +34,9 @@ class XlaTensor { public: // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast // fails. - static XlaTensor* FromTensor(Tensor* tensor); - // Downcast from a Tensor to an XlaTensor. Return nullptr if the downcast - // fails. - static const XlaTensor* FromTensor(const Tensor* tensor); + static XlaTensor* FromTensor(const Tensor* tensor); + + static bool RefCountIsOne(const Tensor& tensor); // Create a DeviceMemoryBase from a Tensor. The Tensor can be an XlaTensor, in // which case the returned value is shaped_buffer()->root_buffer(), or a @@ -54,7 +53,7 @@ class XlaTensor { // Some Tensors can have complex on-device shapes, including tuple shapes. To // manage the memory for these tensors a ShapedBuffer may be required. - // Return true if this TensorInfo contains a ShapedBuffer. + // Return true if this XlaTensor contains a ShapedBuffer. bool has_shaped_buffer() const { return shaped_buffer_ != nullptr; } // Return the contained ShapedBuffer. // REQUIRES: has_shaped_buffer() @@ -62,7 +61,11 @@ class XlaTensor { CHECK(has_shaped_buffer()); return *shaped_buffer_; } - // Mutates the TensorInfo to set the ShapedBuffer. + xla::ShapedBuffer& shaped_buffer() { + CHECK(has_shaped_buffer()); + return *shaped_buffer_; + } + // Mutates the XlaTensor to set the ShapedBuffer. 
void set_shaped_buffer(xla::ScopedShapedBuffer shaped_buffer) { shaped_buffer_ = xla::MakeUnique(std::move(shaped_buffer)); @@ -72,7 +75,7 @@ class XlaTensor { // in on-demand mode to avoid re-copying values from the device if we know the // host value already. - // Return true if this TensorInfo contains a host tensor. + // Return true if this XlaTensor contains a host tensor. bool has_host_tensor() const { return host_tensor_ != nullptr; } // Return the contained host tensor. // REQUIRES: has_host_tensor() diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index f7bad39af082d2..20baa745af6d8e 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -42,7 +42,7 @@ py_library( "//tensorflow/python:array_ops", "//tensorflow/python:client", "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform", "//tensorflow/python:random_seed", "//tensorflow/python:session", @@ -58,7 +58,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", @@ -72,7 +72,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", @@ -93,7 +93,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -111,7 +111,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:bitwise_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:math_ops_gen", "//tensorflow/python:nn_ops", @@ -120,6 +120,19 @@ tf_xla_py_test( ], ) +tf_xla_py_test( + name = "bucketize_op_test", + size = "small", + srcs = ["bucketize_op_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], +) + tf_xla_py_test( name = "categorical_op_test", size = "small", @@ -127,7 +140,7 @@ tf_xla_py_test( tags = ["optonly"], deps = [ ":xla_test", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:random_ops", ], @@ -141,7 +154,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", @@ -156,7 +169,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", @@ -170,7 +183,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", 
"//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -184,7 +197,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:array_ops_gen", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:gradient_checker", "//tensorflow/python:gradients", "//tensorflow/python:math_ops", @@ -196,9 +209,11 @@ tf_xla_py_test( name = "oom_test", size = "medium", srcs = ["oom_test.py"], + # TODO(b/80081500): Re-enable on GPU. Disabled on 2018-05-21. disabled_backends = [ "cpu", "cpu_ondemand", + "gpu", ], tags = [ # Allocates very large amounts of memory and does not work under TSAN. @@ -209,7 +224,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:array_ops_gen", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:gradient_checker", "//tensorflow/python:gradients", "//tensorflow/python:math_ops", @@ -225,7 +240,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:nn", "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", @@ -241,7 +256,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:nn", "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", @@ -263,7 +278,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:nn", "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", @@ -291,7 +306,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -300,10 +315,14 @@ tf_xla_py_test( name = "extract_image_patches_op_test", size = "small", srcs = ["extract_image_patches_op_test.py"], + tags = [ + "manual", + "notap", + ], deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -322,8 +341,12 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", + "//tensorflow/python:layers", + "//tensorflow/python:math_ops", + "//tensorflow/python:nn", "//tensorflow/python:platform_test", + "//tensorflow/python/eager:function", ], ) @@ -338,7 +361,7 @@ tf_xla_py_test( "//tensorflow/contrib/signal:signal_py", "//tensorflow/python:array_ops", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:spectral_ops", ], @@ -352,7 +375,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -364,7 +387,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + 
"//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", @@ -380,7 +403,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -395,12 +418,27 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:image_ops", "//tensorflow/python:platform_test", ], ) +tf_xla_py_test( + name = "listdiff_op_test", + size = "small", + srcs = ["listdiff_op_test.py"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:data_flow_ops", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_ops", + "//tensorflow/python:platform_test", + "@six_archive//:six", + ], +) + tf_xla_py_test( name = "lrn_ops_test", size = "medium", @@ -408,7 +446,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:nn", "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", @@ -423,7 +461,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -435,7 +473,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", @@ -449,7 +487,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -462,7 +500,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -475,7 +513,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", @@ -490,7 +528,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", @@ -507,7 +545,7 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:random_ops", ], @@ -522,7 +560,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:errors", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -538,7 +576,7 @@ tf_xla_py_test( "//tensorflow/compiler/tf2xla/python:xla", "//tensorflow/python:array_ops", "//tensorflow/python:errors", - 
"//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -551,7 +589,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", ], ) @@ -563,7 +601,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -575,7 +613,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", "//tensorflow/python:training", @@ -590,7 +628,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -603,7 +641,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:math_ops_gen", "//tensorflow/python:platform_test", @@ -618,7 +656,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -634,7 +672,7 @@ tf_xla_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -647,7 +685,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/contrib/stateless", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -661,7 +699,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:math_ops_gen", "//tensorflow/python:nn_ops", @@ -680,7 +718,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", ], @@ -693,7 +731,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:nn_ops", "//tensorflow/python:nn_ops_gen", @@ -707,7 +745,7 @@ tf_xla_py_test( srcs = ["fused_batchnorm_test.py"], deps = [ ":xla_test", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:math_ops_gen", "//tensorflow/python:nn", @@ -726,7 +764,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:math_ops_gen", "//tensorflow/python:nn_ops", @@ -745,7 +783,7 @@ 
tf_xla_py_test( ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", "//tensorflow/python:training", ], @@ -755,11 +793,12 @@ tf_xla_py_test( name = "gather_test", size = "medium", srcs = ["gather_test.py"], + tags = ["noasan"], # times out, http://b/78599043 deps = [ ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -771,7 +810,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -784,21 +823,34 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) -gpu_py_test( +tf_xla_py_test( name = "xla_device_test", size = "small", srcs = ["xla_device_test.py"], + tags = ["optonly"], + deps = [ + ":xla_test", + "//tensorflow/python:array_ops", + "//tensorflow/python:framework", + "//tensorflow/python:platform_test", + ], +) + +gpu_py_test( + name = "xla_device_gpu_test", + size = "small", + srcs = ["xla_device_gpu_test.py"], additional_deps = [ "//tensorflow/python:array_ops", "//tensorflow/python:client", "//tensorflow/python:client_testlib", "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", ], ) @@ -815,15 +867,23 @@ gpu_py_test( "//tensorflow/python:client_testlib", "//tensorflow/python:control_flow_ops", "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:gradients", "//tensorflow/python:math_ops", "//tensorflow/python:nn_ops", ], - # TODO(b/62961789): Test fails with SIGABRT - tags = [ - "manual", - "notap", +) + +gpu_py_test( + name = "dense_layer_test", + size = "small", + srcs = ["dense_layer_test.py"], + additional_deps = [ + "//tensorflow/contrib/compiler:compiler_py", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:layers", + "//tensorflow/python:variables", ], ) @@ -866,7 +926,7 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:math_ops", "//tensorflow/python:random_ops", "//tensorflow/python:variables", @@ -881,7 +941,7 @@ gpu_py_test( ":xla_test", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:gradients", "//tensorflow/python:init_ops", "//tensorflow/python:math_ops", @@ -919,7 +979,7 @@ tf_xla_py_test( srcs = ["fake_quant_ops_test.py"], deps = [ ":xla_test", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", ], ) @@ -931,7 +991,7 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework", "//tensorflow/python:platform_test", 
], ) diff --git a/tensorflow/compiler/tests/argminmax_test.py b/tensorflow/compiler/tests/argminmax_test.py index ec547e16cd9c91..9d3a889b1f54c8 100644 --- a/tensorflow/compiler/tests/argminmax_test.py +++ b/tensorflow/compiler/tests/argminmax_test.py @@ -29,51 +29,70 @@ class ArgMinMaxTest(xla_test.XLATestCase): - def _assertOpOutputMatchesExpected(self, op, inp, expected): - """Verifies that 'op' produces 'expected' when fed input 'inp' . + def _assertOpOutputMatchesExpected(self, op, axis, output_type, op_input, + expected): + """Verifies that 'op' produces 'expected' when fed input 'op_input'. Args: - op: operator to test - inp: numpy input array to use as input to 'op'. + op: argmin or argmax operator to test. + axis: integer axis to reduce across. + output_type: numpy datatype of the output to produce. + op_input: numpy input array to use as input to 'op'. expected: numpy array representing the expected output of 'op'. """ with self.test_session() as session: with self.test_scope(): pinp = array_ops.placeholder( - dtypes.as_dtype(inp.dtype), inp.shape, name="a") - output = op(pinp) - result = session.run(output, {pinp: inp}) + dtypes.as_dtype(op_input.dtype), op_input.shape, name="a") + output = op(pinp, axis=axis, output_type=output_type) + result = session.run(output, {pinp: op_input}) self.assertAllEqual(result, expected) def testArgMinMax(self): # Complex numbers do not support argmin/argmax. minmax_types = set(self.numeric_types) - set(self.complex_types) for dtype in minmax_types: - self._assertOpOutputMatchesExpected( - lambda x: math_ops.argmax(x, axis=0, output_type=dtypes.int32), - np.array([1, 10, 27, 3, 3, 4], dtype=dtype), - expected=np.int32(2)) - self._assertOpOutputMatchesExpected( - lambda x: math_ops.argmax(x, axis=0, output_type=dtypes.int32), - np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype), - expected=np.array([0, 1, 0], dtype=np.int32)) - self._assertOpOutputMatchesExpected( - lambda x: math_ops.argmax(x, axis=1, output_type=dtypes.int32), - np.array([[4, 1], [3, 2]], dtype=dtype), - expected=np.array([0, 0], dtype=np.int32)) + # output_type is a numpy data type that is used to specify the desired + # output type of the op as well as to convert the Python number to the + # array scalar of the type.
+ for output_type in self.int_types: + self._assertOpOutputMatchesExpected( + math_ops.argmax, + axis=0, + output_type=output_type, + op_input=np.array([1, 10, 27, 3, 3, 4], dtype=dtype), + expected=output_type(2)) + self._assertOpOutputMatchesExpected( + math_ops.argmax, + axis=0, + output_type=output_type, + op_input=np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype), + expected=np.array([0, 1, 0], dtype=output_type)) + self._assertOpOutputMatchesExpected( + math_ops.argmax, + axis=1, + output_type=output_type, + op_input=np.array([[4, 1], [3, 2]], dtype=dtype), + expected=np.array([0, 0], dtype=output_type)) - self._assertOpOutputMatchesExpected( - lambda x: math_ops.argmin(x, axis=0, output_type=dtypes.int32), - np.array([3, 10, 27, 3, 2, 4], dtype=dtype), - expected=np.int32(4)) - self._assertOpOutputMatchesExpected( - lambda x: math_ops.argmin(x, axis=0, output_type=dtypes.int32), - np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype), - expected=np.array([1, 0, 1], dtype=np.int32)) - self._assertOpOutputMatchesExpected( - lambda x: math_ops.argmin(x, axis=1, output_type=dtypes.int32), - np.array([[4, 1], [3, 2]], dtype=dtype), - expected=np.array([1, 1], dtype=np.int32)) + self._assertOpOutputMatchesExpected( + math_ops.argmin, + axis=0, + output_type=output_type, + op_input=np.array([3, 10, 27, 3, 2, 4], dtype=dtype), + expected=output_type(4)) + self._assertOpOutputMatchesExpected( + math_ops.argmin, + axis=0, + output_type=output_type, + op_input=np.array([[4, 1, 7], [3, 2, 4]], dtype=dtype), + expected=np.array([1, 0, 1], dtype=output_type)) + self._assertOpOutputMatchesExpected( + math_ops.argmin, + axis=1, + output_type=output_type, + op_input=np.array([[4, 1], [3, 2]], dtype=dtype), + expected=np.array([1, 1], dtype=output_type)) if __name__ == "__main__": diff --git a/tensorflow/compiler/tests/bucketize_op_test.py b/tensorflow/compiler/tests/bucketize_op_test.py new file mode 100644 index 00000000000000..fde9759a1c2098 --- /dev/null +++ b/tensorflow/compiler/tests/bucketize_op_test.py @@ -0,0 +1,78 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for bucketize_op.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.compiler.tests.xla_test import XLATestCase +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors_impl +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class BucketizationOpTest(XLATestCase): + + def testInt(self): + with self.test_session() as sess: + p = array_ops.placeholder(dtypes.int32) + with self.test_scope(): + op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11]) + expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4] + self.assertAllEqual(expected_out, + sess.run(op, {p: [-5, 0, 2, 3, 5, 8, 10, 11, 12]})) + + def testFloat(self): + with self.test_session() as sess: + p = array_ops.placeholder(dtypes.float32) + with self.test_scope(): + op = math_ops._bucketize(p, boundaries=[0., 3., 8., 11.]) + expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4] + self.assertAllEqual( + expected_out, + sess.run(op, {p: [-5., 0., 2., 3., 5., 8., 10., 11., 12.]})) + + def test2DInput(self): + with self.test_session() as sess: + p = array_ops.placeholder(dtypes.float32) + with self.test_scope(): + op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11]) + expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]] + self.assertAllEqual( + expected_out, sess.run(op, + {p: [[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]})) + + def testInvalidBoundariesOrder(self): + with self.test_session() as sess: + p = array_ops.placeholder(dtypes.int32) + with self.test_scope(): + op = math_ops._bucketize(p, boundaries=[0, 8, 3, 11]) + with self.assertRaisesRegexp(errors_impl.InvalidArgumentError, + "Expected sorted boundaries"): + sess.run(op, {p: [-5, 0]}) + + def testBoundariesNotList(self): + with self.test_session(): + with self.assertRaisesRegexp(TypeError, "Expected list.*"): + p = array_ops.placeholder(dtypes.int32) + with self.test_scope(): + math_ops._bucketize(p, boundaries=0) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py new file mode 100644 index 00000000000000..865f60ccab46ec --- /dev/null +++ b/tensorflow/compiler/tests/dense_layer_test.py @@ -0,0 +1,135 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for DenseLayer JIT compilation on the CPU and GPU devices.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import numpy as np + +from tensorflow.contrib.compiler import jit +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.layers import layers +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test + +jit_scope = jit.experimental_jit_scope + + +def GetRunMetadataLabels(run_metadata): + """Returns all labels in run_metadata.""" + labels = [] + for dev_stats in run_metadata.step_stats.dev_stats: + for node_stats in dev_stats.node_stats: + labels.append(node_stats.timeline_label) + return labels + + +def InLabels(labels, substr): + """Returns true iff one of the labels contains substr.""" + return any([substr in x for x in labels]) + + +def XlaLaunchOpCount(labels): + """Count how many XlaLaunch labels are present.""" + return sum("XlaLaunch(" in x for x in labels) + + +class DenseLayerTest(test.TestCase): + + def testDenseLayerAutoJit(self): + """Tests dense layer compilation in auto-jit mode. + + Dense layer should be compiled into a single XlaLaunch op in auto-jit mode. + """ + + os.environ["TF_XLA_FLAGS"] = ("--tf_xla_cpu_global_jit") + config = config_pb2.ConfigProto() + config.graph_options.optimizer_options.global_jit_level = ( + config_pb2.OptimizerOptions.ON_1) + + with self.test_session(config=config) as sess: + x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32) + y = layers.dense(x, 3) + + sess.run(variables.initialize_all_variables()) + run_metadata = config_pb2.RunMetadata() + sess.run( + y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) + + labels = GetRunMetadataLabels(run_metadata) + self.assertEqual(1, XlaLaunchOpCount(labels)) + self.assertFalse(InLabels(labels, "ListDiff")) + + def testDenseLayerJitScopeDefinedShape(self): + """Tests that the dense layer node is properly compiled in jit scope. + + Dense layer with static shape input tensor should be compiled into a single + XlaLaunch op by XLA. + """ + + with self.test_session() as sess: + x = array_ops.placeholder(shape=[2, 2, 3], dtype=np.float32) + with jit_scope(): + y = layers.dense(x, 3) + + sess.run(variables.initialize_all_variables()) + run_metadata = config_pb2.RunMetadata() + sess.run( + y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) + + labels = GetRunMetadataLabels(run_metadata) + self.assertEqual(1, XlaLaunchOpCount(labels)) + # No need to check whether ListDiff is compiled or not because ListDiff op + # is not used when input tensor shape is fully defined. + + def testDenseLayerJitScopeUndefinedShape(self): + """Tests that the dense layer node is properly compiled in jit scope. + + Dense layer uses shape op to get shape of input tensor if its shape is not + fully defined. XLA does not cluster shape op with other operators. But in + experimental_jit_scope, XLA is forced to compile shape op into its own + cluster, causing dense layer to be split into TWO XlaLaunch ops. 
+ """ + + with self.test_session() as sess: + x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32) + with jit_scope(): + y = layers.dense(x, 3) + + sess.run(variables.initialize_all_variables()) + run_metadata = config_pb2.RunMetadata() + sess.run( + y, {x: np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])}, + run_metadata=run_metadata, + options=config_pb2.RunOptions( + trace_level=config_pb2.RunOptions.FULL_TRACE)) + + labels = GetRunMetadataLabels(run_metadata) + self.assertEqual(2, XlaLaunchOpCount(labels)) + self.assertFalse(InLabels(labels, "ListDiff")) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/depthwise_conv_op_test.py b/tensorflow/compiler/tests/depthwise_conv_op_test.py index 0a0d335ca76dd7..03d96a2cd8ab22 100644 --- a/tensorflow/compiler/tests/depthwise_conv_op_test.py +++ b/tensorflow/compiler/tests/depthwise_conv_op_test.py @@ -153,7 +153,7 @@ def _VerifyValues(self, dtype=data_type).reshape(filter_in_sizes) with self.test_session() as sess: if data_type == np.float32: - tolerance = 1e-5 + tolerance = 1e-4 else: self.assertEqual(data_type, np.float64) tolerance = 1e-8 @@ -339,7 +339,7 @@ def _GetVal(use_xla): gpu_value = _GetVal(use_xla=True) cpu_value = _GetVal(use_xla=False) - self.assertAllClose(cpu_value, gpu_value, rtol=1e-4, atol=1e-4) + self.assertAllClose(cpu_value, gpu_value, rtol=1e-3, atol=1e-3) def testDepthwiseConv2DInputGradCompare(self): for index, (input_size, filter_size, output_size, stride, diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py index bdd0185dfe4abe..4dff5f0f405fb1 100644 --- a/tensorflow/compiler/tests/eager_test.py +++ b/tensorflow/compiler/tests/eager_test.py @@ -24,10 +24,16 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.eager import backprop from tensorflow.python.eager import context +from tensorflow.python.eager import function from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.layers import convolutional +from tensorflow.python.layers import pooling from tensorflow.python.ops import array_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops from tensorflow.python.ops import resource_variable_ops from tensorflow.python.platform import googletest @@ -43,7 +49,7 @@ def testBasic(self): def testExecuteListOutputLen0(self): with self.test_scope(): - empty = constant_op.constant([], dtype=dtypes.int32) + empty = constant_op.constant([], dtype=dtypes.float32) result = array_ops.unstack(empty, 0) self.assertTrue(isinstance(result, list)) self.assertEqual(0, len(result)) @@ -51,7 +57,7 @@ def testExecuteListOutputLen0(self): def testExecuteListOutputLen1(self): with self.test_scope(): split_dim = constant_op.constant(1) - value = constant_op.constant([[0, 1, 2], [3, 4, 5]]) + value = constant_op.constant([[0., 1., 2.], [3., 4., 5.]]) result = array_ops.split(value, 1, axis=split_dim) self.assertTrue(isinstance(result, list)) self.assertEqual(1, len(result)) @@ -60,7 +66,7 @@ def testExecuteListOutputLen1(self): def testExecuteListOutputLen3(self): with self.test_scope(): split_dim = constant_op.constant(1) - value = constant_op.constant([[0, 1, 2], [3, 4, 5]]) + value = constant_op.constant([[0., 1., 2.], [3., 4., 5.]]) result = array_ops.split(value, 3, axis=split_dim) self.assertTrue(isinstance(result, list)) 
self.assertEqual(3, len(result)) @@ -111,6 +117,15 @@ def testAssignAddVariable(self): v.assign_add(2.0) self.assertEqual(3.0, v.numpy()) + def testReadAssignRead(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable(1.0) + val1 = v.read_value() + v.assign_add(2.0) + val2 = v.read_value() + self.assertEqual(1.0, val1.numpy()) + self.assertEqual(3.0, val2.numpy()) + def testGradient(self): def f(x): return x @@ -130,8 +145,189 @@ def f(): grads = backprop.implicit_grad(f)() self.assertEqual(2., grads[0][0].numpy()) + def testMultipleVariableReads(self): + # This test makes sure consecutive variable reads don't copy + # the underlying memory. + with self.test_scope(): + # Create 128MiB variables + var = resource_variable_ops.ResourceVariable( + array_ops.ones([32, 1024, 1024])) + + # Read the same variable 100 times. If the underlying tensor + # is not copied, this is a trivial operation. If it is copied, + # this will eat over 13GB and OOM. + values = [] + for _ in range(100): + values.append(var.value()) + + +class EagerFunctionTest(XLATestCase): + + def testBasic(self): + with self.test_scope(): + matmul = function.defun(math_ops.matmul, compiled=True) + t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]]) + sq = matmul(t, t, transpose_a=True) + self.assertAllEqual(sq.numpy().reshape(-1), [10, 14, 14, 20]) + + def testConv(self): + if 'GPU' in self.device: + # TODO(b/32333178) + self.skipTest('Current implementation of RandomStandardNormal kernel ' + 'is very slow on GPU, and has been blacklisted.') + with self.test_scope(): + data_format = 'channels_last' + conv = convolutional.Conv2D( + filters=1, kernel_size=2, padding='VALID', + data_format=data_format, activation=nn_ops.relu, + kernel_initializer=init_ops.ones_initializer(), + bias_initializer=init_ops.zeros_initializer()) + pool = pooling.MaxPooling2D(2, 2, data_format=data_format) + + def model(x): + x = conv(x) + return pool(x) + model = function.defun(model, compiled=True) + + x = array_ops.ones([1, 4, 4, 1]) + y = model(x) + self.assertAllEqual(y.numpy(), [[[[4.]]]]) + + def testReadVariable(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable(1.0) + + @function.defun(compiled=True) + def f(): + return v.read_value() + + var = f() + self.assertEqual(1.0, var.numpy()) + + def testUpdateVariable(self): + with self.test_scope(): + v = resource_variable_ops.ResourceVariable(1.0) + + def f(v): + v.assign_add(1.0) + return v + + f = function.defun(f, compiled=True) + + var = f(v) + self.assertEqual(2.0, var.numpy()) + + def testAllArgumentKinds(self): + """Test a complex function that takes different argument kinds. + + tf2xla machinery that translates, compiles, and runs defuns + classifies arguments into: compile-time constants, regular tensors, + and resources. This test creates a function with a mix of all these + kinds. Moreover, the order of function arguments is intentionally mixed up. + + This also tests the case when the same argument is a compile-time constant + as well as used in an operation that normally expects its inputs to be + in device memory - addition in this case. 
+ """ + with self.test_scope(): + def foo(c1, r1, v1, c2, v2, r2): + # c1 and c2 are compile-time constants + # r1 and r2 are regular tensors + # v1 and v2 are resource variables + a = c1 + r1 + b = math_ops.cast(c2, dtypes.float32) + v2 + c = array_ops.slice(v1, c1, c2) + d = r2 * v2 + return a, b, c, d + + foo = function.defun(foo, compiled=True) + + c1 = [0, 0] + c2 = array_ops.ones([2], dtype=dtypes.int32) + + r1 = array_ops.ones([2]) + r2 = [[2., 2.], [3., 3.]] + + v1 = resource_variable_ops.ResourceVariable([[1., 2.], [3., 4.]]) + v2 = resource_variable_ops.ResourceVariable([[10., 20.], [30., 40.]]) + + a, b, c, d = foo(c1, r1, v1, c2, v2, r2) + + self.assertAllEqual([1, 1], a.numpy()) + self.assertAllEqual([[11., 21.], [31., 41.]], b.numpy()) + self.assertAllEqual([[1.]], c.numpy()) + self.assertAllEqual([[20., 40.], [90., 120.]], d.numpy()) + + def testDefunInGradientTape(self): + with self.test_scope(): + v0 = resource_variable_ops.ResourceVariable(5.0) + + @function.defun(compiled=True) + def f(x): + x = v0 * v0 * x + return x + + x = constant_op.constant(3.0) + with backprop.GradientTape() as tape: + y = f(x) + dy = tape.gradient(y, v0) + + self.assertEqual(75, y.numpy()) + self.assertEqual(30, dy.numpy()) + + +class ExcessivePaddingTest(XLATestCase): + """Test that eager execution works with TPU flattened tensors. + + Tensors that would normally be excessively padded when written + to TPU memory are reshaped to 1-D flat tensors. + + This test case verifies that such tensors work with eager execution. + + The flattening currently only happens on TPU, but tests should work + fine with all backends as flattening is transparent. + """ + + def testFromConstant(self): + with self.test_scope(): + # Create constant of shape [100, 2, 1]. This tensor would be + # excessively padded on TPU. + tensor = constant_op.constant(100 * [[[10.0], [2.0]]]) + # Use reduce_sum since it requires correctly working with + # a particular dimension. 
+ reduced = math_ops.reduce_sum(tensor, axis=1) + self.assertAllEqual(100 * [[12.0]], reduced) + + def testFromOperation(self): + with self.test_scope(): + tensor = array_ops.ones([3, 100, 2, 2]) + reduced = math_ops.reduce_sum(tensor, axis=[0, 2, 3]) + self.assertAllEqual(100 * [12.0], reduced) + + def testAsFunctionInput(self): + with self.test_scope(): + + @function.defun(compiled=True) + def f(x): + return math_ops.reduce_sum(x, axis=2) + + tensor = constant_op.constant(100 * [[[10.0, 2.0]]]) + reduced = f(tensor) + self.assertAllEqual(100 * [[12.0]], reduced) + + def testAsFunctionOutput(self): + with self.test_scope(): + + @function.defun(compiled=True) + def f(x): + return x * constant_op.constant(100 * [[[10.0, 2.0]]]) + + y = f(3) + reduced = math_ops.reduce_sum(y, axis=2) + self.assertAllEqual(100 * [[36.0]], reduced) + -if __name__ == "__main__": +if __name__ == '__main__': ops.enable_eager_execution( config=config_pb2.ConfigProto(log_device_placement=True)) googletest.main() diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py index fbc3c994d163a5..8a3f4b0bdc7a61 100644 --- a/tensorflow/compiler/tests/function_test.py +++ b/tensorflow/compiler/tests/function_test.py @@ -24,12 +24,10 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import function -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest -@test_util.with_c_api class FunctionTest(XLATestCase): def testFunction(self): diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py index 42e637734c578f..7cf953ef25ef5d 100644 --- a/tensorflow/compiler/tests/image_ops_test.py +++ b/tensorflow/compiler/tests/image_ops_test.py @@ -65,9 +65,7 @@ def testBatch(self): join1 = array_ops.stack(split1) join2 = array_ops.stack(split2) batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2], - { - batch0: inp - }) + {batch0: inp}) # Verify that processing batch elements together is the same as separate self.assertAllClose(batch1, join1) @@ -401,9 +399,7 @@ def testAdjustRandomSaturation(self): x = array_ops.placeholder(dtypes.float32, shape=x_shape) with self.test_scope(): y_fused = self._adjust_saturation(x, - scale).eval(feed_dict={ - x: x_np - }) + scale).eval(feed_dict={x: x_np}) self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5) @@ -412,7 +408,8 @@ class ResizeBilinearTest(XLATestCase): def _assertForwardOpMatchesExpected(self, image_np, target_shape, - expected=None): + expected=None, + large_tolerance=False): if expected is None: self.fail("expected must be specified") with self.test_session() as sess, self.test_scope(): @@ -420,7 +417,11 @@ def _assertForwardOpMatchesExpected(self, resized = gen_image_ops.resize_bilinear( image, target_shape, align_corners=True) out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]}) - self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out) + if large_tolerance: + self.assertAllClose( + expected[np.newaxis, :, :, np.newaxis], out, rtol=0.03, atol=0.1) + else: + self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out) def _assertBackwardOpMatchesExpected(self, grads_np, @@ -555,6 +556,28 @@ def testAlignCorners3x3To9x9Grad(self): [[12.5, 27.5, 21.875], [42.5, 80.0, 57.5], [40.625, 72.5, 50]], dtype=np.float32)) + def testAlignCorners4x4To8x8(self): + 
self._assertForwardOpMatchesExpected( + (np.array([[0, 1, 2, 3]], dtype=np.float32) + np.array( + [[0], [1], [2], [3]], dtype=np.float32)) * 7.0, [8, 8], + expected=3 * + (np.array([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=np.float32) + np.array( + [[0], [1], [2], [3], [4], [5], [6], [7]], dtype=np.float32)), + large_tolerance=True) + + def testAlignCorners8x8To16x16(self): + self._assertForwardOpMatchesExpected( + (np.array([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=np.float32) + np.array( + [[0], [1], [2], [3], [4], [5], [6], [7]], dtype=np.float32)) * 15.0, + [16, 16], + expected=7 * (np.array( + [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]], + dtype=np.float32) + np.array( + [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], + [12], [13], [14], [15]], + dtype=np.float32)), + large_tolerance=True) + if __name__ == "__main__": test.main() diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py index 1f7da659e5590b..6e0db54b7a74b2 100644 --- a/tensorflow/compiler/tests/jit_test.py +++ b/tensorflow/compiler/tests/jit_test.py @@ -78,10 +78,10 @@ def InLabels(labels, substr): def MetadataHasXlaLaunch(run_metadata): - """Returns true if there is a _XlaLaunch kernel in run_metadata's timeline.""" + """Returns true if there is a XlaLaunch kernel in run_metadata's timeline.""" # TODO(phawkins): find a less hacky way to test whether a kernel ran. - return InLabels(RunMetadataLabels(run_metadata), "_XlaLaunch") + return InLabels(RunMetadataLabels(run_metadata), "XlaLaunch") class JitLaunchTest(test.TestCase): @@ -90,8 +90,8 @@ class JitLaunchTest(test.TestCase): # Verifies that the outputs match and that XLA was invoked. 'fn' must take # the same number of tensors as arguments that are in 'args', and must return # a tuple of output tensors. - # If 'require_kernel_launch' is True, then we verify that a _XlaLaunch node - # actually ran. However, it is sometimes possible for _XlaLaunch ops to be + # If 'require_kernel_launch' is True, then we verify that a XlaLaunch node + # actually ran. However, it is sometimes possible for XlaLaunch ops to be # constant-folded away, so the check is optional. def _compare(self, fn, args, require_kernel_launch=True, noinline=None): with session_lib.Session(config=NoRewriteSessionConfig()) as sess: @@ -125,7 +125,7 @@ def _compare(self, fn, args, require_kernel_launch=True, noinline=None): for (x, y) in zip(compiled, direct): self.assertAllClose(x, y, rtol=1e-1) else: - self.assertAllClose(compiled, direct) + self.assertAllClose(compiled, direct, rtol=1e-2) def testNoOutputs(self): with session_lib.Session() as sess: @@ -441,14 +441,14 @@ def Forward(x): self.assertFalse(InLabels(labels, "Log")) self.assertTrue(InLabels(labels, "Reciprocal")) self.assertTrue(InLabels(labels, "Mul")) - self.assertFalse(InLabels(labels, "_XlaLaunch")) + self.assertFalse(InLabels(labels, "XlaLaunch")) - # Compile the backprop. One _XlaLaunch. + # Compile the backprop. One XlaLaunch. 
labels = _Run(compiled=True) self.assertFalse(InLabels(labels, "Log")) self.assertFalse(InLabels(labels, "Reciprocal")) self.assertFalse(InLabels(labels, "Mul")) - self.assertTrue(InLabels(labels, "_XlaLaunch")) + self.assertTrue(InLabels(labels, "XlaLaunch")) class ElementWiseFusionTest(test.TestCase): @@ -482,14 +482,15 @@ def simpleTest(self, arg0, arg1, global_jit_level): trace_level=config_pb2.RunOptions.FULL_TRACE)) labels = RunMetadataLabels(run_metadata) - count = sum("_XlaLaunch(" in x for x in labels) + count = sum("XlaLaunch(" in x for x in labels) return output, count def testElementWiseClustering(self): arg0 = np.random.rand(2, 2).astype(np.float32) arg1 = np.random.rand(2, 2).astype(np.float32) - os.environ["TF_XLA_FLAGS"] = "--tf_xla_fusion_only=true" + os.environ["TF_XLA_FLAGS"] = ("--tf_xla_fusion_only=true " + "--tf_xla_cpu_global_jit") tf_op, tf_count = self.simpleTest(arg0, arg1, config_pb2.OptimizerOptions.OFF) self.assertEqual(0, tf_count) diff --git a/tensorflow/compiler/tests/listdiff_op_test.py b/tensorflow/compiler/tests/listdiff_op_test.py new file mode 100644 index 00000000000000..45a04f0cf56e88 --- /dev/null +++ b/tensorflow/compiler/tests/listdiff_op_test.py @@ -0,0 +1,101 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for XLA listdiff operator.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class ListDiffTest(xla_test.XLATestCase): + + def _testListDiff(self, x, y, out, idx): + for dtype in [dtypes.int32, dtypes.int64]: + for index_dtype in [dtypes.int32, dtypes.int64]: + with self.test_session() as sess: + x_tensor = ops.convert_to_tensor(x, dtype=dtype) + y_tensor = ops.convert_to_tensor(y, dtype=dtype) + with self.test_scope(): + out_tensor, idx_tensor = array_ops.listdiff( + x_tensor, y_tensor, out_idx=index_dtype) + tf_out, tf_idx = sess.run([out_tensor, idx_tensor]) + self.assertAllEqual(out, tf_out) + self.assertAllEqual(idx, tf_idx) + self.assertEqual(1, out_tensor.get_shape().ndims) + self.assertEqual(1, idx_tensor.get_shape().ndims) + + def testBasic1(self): + self._testListDiff(x=[1, 2, 3, 4], y=[1, 2], out=[3, 4], idx=[2, 3]) + + def testBasic2(self): + self._testListDiff(x=[1, 2, 3, 4], y=[2], out=[1, 3, 4], idx=[0, 2, 3]) + + def testBasic3(self): + self._testListDiff(x=[1, 4, 3, 2], y=[4, 2], out=[1, 3], idx=[0, 2]) + + def testDuplicates(self): + self._testListDiff(x=[1, 2, 4, 3, 2, 3, 3, 1], + y=[4, 2], + out=[1, 3, 3, 3, 1], + idx=[0, 3, 5, 6, 7]) + + def testRandom(self): + num_random_tests = 10 + int_low = -7 + int_high = 8 + max_size = 50 + for _ in xrange(num_random_tests): + x_size = np.random.randint(max_size + 1) + x = np.random.randint(int_low, int_high, size=x_size) + y_size = np.random.randint(max_size + 1) + y = np.random.randint(int_low, int_high, size=y_size) + out_idx = [(entry, pos) for pos, entry in enumerate(x) if entry not in y] + if out_idx: + out, idx = map(list, zip(*out_idx)) + else: + out = [] + idx = [] + self._testListDiff(list(x), list(y), out, idx) + + def testFullyOverlapping(self): + self._testListDiff(x=[1, 2, 3, 4], y=[1, 2, 3, 4], out=[], idx=[]) + + def testNonOverlapping(self): + self._testListDiff(x=[1, 2, 3, 4], + y=[5, 6], + out=[1, 2, 3, 4], + idx=[0, 1, 2, 3]) + + def testEmptyX(self): + self._testListDiff(x=[], y=[1, 2], out=[], idx=[]) + + def testEmptyY(self): + self._testListDiff(x=[1, 2, 3, 4], y=[], out=[1, 2, 3, 4], idx=[0, 1, 2, 3]) + + def testEmptyXY(self): + self._testListDiff(x=[], y=[], out=[], idx=[]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/oom_test.py b/tensorflow/compiler/tests/oom_test.py index 1434e965e3d7ea..d68d32057a3677 100644 --- a/tensorflow/compiler/tests/oom_test.py +++ b/tensorflow/compiler/tests/oom_test.py @@ -22,6 +22,8 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops from tensorflow.python.platform import googletest @@ -42,20 +44,33 @@ def testOutputOutOfMemory(self): """ def test_loop(): - size = 2e8 + size = int(2e8) while True: with self.test_session(): - # Force the compiled code to not be constant by feeding in an addend. 
- p = array_ops.placeholder(dtypes.float32, shape=[]) + # Force the compiled code to not be constant by feeding in a + # parameter. + p = array_ops.placeholder(dtypes.float32, shape=[2, 1, 1]) with self.test_scope(): - # Create a large R1 tensor. - c = array_ops.zeros([size, 1]) + p + # Create a computation that produces a large R1 tensor as an + # intermediate result. Reduce it down so that if this file was + # compiled without --config=cuda, we don't force a D2H copy of a + # large tensor and potentially OOM the host. + # + # This is a bit tricky because XLA:GPU doesn't currently support RNG + # ops. Here we rely on the fact that XLA doesn't do algebraic + # simplifications on conv(<ones>, <parameter>). + c = math_ops.reduce_sum( + nn_ops.convolution( + array_ops.ones([1, size, 1]), + p, + padding='SAME', + data_format='NWC')) - c.eval(feed_dict={p: 1.0}) + c.eval(feed_dict={p: [[[1.0]], [[2.0]]]}) size *= 2 self.assertRaises(errors.ResourceExhaustedError, test_loop) -if __name__ == "__main__": +if __name__ == '__main__': googletest.main() diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py index d6c93088d4efff..70be22936a500f 100644 --- a/tensorflow/compiler/tests/random_ops_test.py +++ b/tensorflow/compiler/tests/random_ops_test.py @@ -76,6 +76,13 @@ def testRandomUniformIsInRange(self): self.assertTrue((y >= -2).sum() == 1000) self.assertTrue((y < 33).sum() == 1000) + def testTruncatedNormalIsNotConstant(self): + def rng(dtype): + return random_ops.truncated_normal(shape=[2], dtype=dtype) + + # TODO(b/34339814): implement inverse erf support for non-F32 types. + self._testRngIsNotConstant(rng, dtypes.float32) + def testTruncatedNormalIsInRange(self): count = 10000 # TODO(b/34339814): implement inverse erf support for non-F32 types. diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index e53efc3091d893..16f293891d56d7 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -619,8 +619,8 @@ std::vector OpTest::ImageDims(TensorFormat format, int batch, dims.push_back(dim); } break; - case FORMAT_NCHW_VECT_C: - LOG(FATAL) << "FORMAT_NCHW_VECT_C not supported."; + default: + LOG(FATAL) << "Tensor format " << ToString(format) << " not supported."; } return dims; } diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py index 2c084b04fa2f67..7420724bdbeab6 100644 --- a/tensorflow/compiler/tests/reduce_ops_test.py +++ b/tensorflow/compiler/tests/reduce_ops_test.py @@ -19,6 +19,7 @@ from __future__ import print_function import functools +import itertools import numpy as np from tensorflow.compiler.tests.xla_test import XLATestCase @@ -155,5 +156,68 @@ def testReduceAny(self): self._testReduction(math_ops.reduce_any, np.any, np.bool, self.BOOL_DATA) +class ReduceOpPrecisionTest(XLATestCase): + + def _testReduceSum(self, + expected_result, + dtype, + test_inputs, + rtol=1e-3, + atol=1e-4): + """Tests reduce sum on a list of input arrays. + + For each array in test_inputs, check that performing reduce sum on the array + produces a value that is close to the expected result. + + Args: + expected_result: the expected result. + dtype: the data type of the reduce sum operation. + test_inputs: a list of input arrays for the reduce sum operation. + rtol: the relative error. + atol: the absolute error.
+ """ + + for test_input in test_inputs: + with self.test_session() as sess: + with self.test_scope(): + a = array_ops.placeholder(dtype) + index = array_ops.placeholder(dtypes.int32) + out = math_ops.reduce_sum(a, index) + result = sess.run(out, { + a: np.array(test_input, dtype=dtype), + index: [0] + }) + # Compare the results using float32 type. + self.assertAllClose( + np.float32(result), + np.float32(expected_result), + rtol=rtol, + atol=atol) + + def testReduceSumF16(self): + """Tests the reduce sum of float16 doesn't lose too much precision.""" + + if np.float16 not in self.all_types: + return + + f16_max = np.finfo(np.float16).max + self._testReduceSum( + f16_max, np.float16, + itertools.permutations([f16_max, f16_max, f16_max * (-1.0)], 3)) + + def testReduceSumBF16(self): + """Tests the reduce sum of bfloat16 doesn't lose too much precision.""" + + if dtypes.bfloat16.as_numpy_dtype not in self.all_types: + return + + bf16_max = np.float32(dtypes.bfloat16.max) + f32_max = dtypes.float32.max + value = min(bf16_max, f32_max - bf16_max) + self._testReduceSum( + dtypes.bfloat16.as_numpy_dtype(value), dtypes.bfloat16.as_numpy_dtype, + itertools.permutations([bf16_max, value, bf16_max * (-1.0)], 3)) + + if __name__ == '__main__': googletest.main() diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py index 4336ebdbd184a0..b6f8390a45d43b 100644 --- a/tensorflow/compiler/tests/stateless_random_ops_test.py +++ b/tensorflow/compiler/tests/stateless_random_ops_test.py @@ -86,6 +86,15 @@ def testDistributionOfStatelessRandomUniform(self): # seed were not fixed. self.assertTrue(self._chi_squared(y, 10) < 16.92) + def testRandomNormalIsFinite(self): + with self.test_session() as sess, self.test_scope(): + for dtype in self._random_types(): + seed_t = array_ops.placeholder(dtypes.int32, shape=[2]) + x = stateless.stateless_random_uniform( + shape=[10000], seed=seed_t, dtype=dtype) + y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]}) + self.assertTrue(np.all(np.isfinite(y))) + def _normal_cdf(self, x): """Cumulative distribution function for a standard normal distribution.""" return 0.5 + 0.5 * np.vectorize(math.erf)(x / math.sqrt(2)) diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py index 7624d6e4b2e2ec..f332aa2e9b97e1 100644 --- a/tensorflow/compiler/tests/tensor_array_ops_test.py +++ b/tensorflow/compiler/tests/tensor_array_ops_test.py @@ -472,7 +472,9 @@ def _testTensorArrayGradientWriteReadType(self, dtype): self.assertAllEqual(c([[-2.0, -10.0]]), grad_vals[1]) def testTensorArrayGradientWriteRead(self): - for dtype in self.numeric_types: + for dtype in self.float_types: + self._testTensorArrayGradientWriteReadType(dtype) + for dtype in self.complex_types: self._testTensorArrayGradientWriteReadType(dtype) def _testTensorArrayGradientWritePackConcatAndRead(self): diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index ba79f393a8f9b2..689a4a1f4e02f5 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ b/tensorflow/compiler/tests/unary_ops_test.py @@ -209,7 +209,8 @@ def testFloatOps(self): self._assertOpOutputMatchesExpected( math_ops.expm1, np.array([[-1, 1]], dtype=dtype), - expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype)) + expected=np.array([[-0.63212056, 1.71828183]], dtype=dtype), + rtol=1e-5) self._assertOpOutputMatchesExpected( math_ops.floor, @@ -251,12 +252,12 @@ def 
testFloatOps(self): np.array([[1, 2]], dtype=dtype), expected=np.array([[0.540297, -0.41614]], dtype=dtype)) - # TODO(b/34703906): improve log1p implementation and make tolerance - # tighter. self._assertOpOutputMatchesExpected( math_ops.log1p, np.array([[1e-14, 1e-15, 0.6]], dtype=dtype), - expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype))) + expected=np.log1p(np.array([[1e-14, 1e-15, 0.6]], dtype=dtype)), + rtol=1e-4, + atol=1e-6) self._assertOpOutputMatchesExpected( math_ops.rint, @@ -333,13 +334,19 @@ def testFloatOps(self): self._assertOpOutputMatchesExpected( nn_ops.elu, - np.array([[-1, 0, 1]], dtype=dtype), - expected=np.array([[-0.63212056, 0, 1]], dtype=dtype)) + np.array([[-1, 0, 1, -1e-6]], dtype=dtype), + expected=np.array([[-0.63212056, 0, 1, -9.999995e-07]], dtype=dtype), + rtol=1e-5, + atol=1e-6) self._assertOpOutputMatchesExpected( nn_ops.selu, - np.array([[-1, 0, 1]], dtype=dtype), - expected=np.array([[-1.11133074, 0., 1.05070099]], dtype=dtype)) + np.array([[-1, 0, 1, -1e-5]], dtype=dtype), + expected=np.array( + [[-1.11133074, 0., 1.05070099, -1.758090550379974e-05]], + dtype=dtype), + rtol=1e-5, + atol=1e-6) self._assertOpOutputMatchesExpected( nn_ops.relu, @@ -419,7 +426,9 @@ def testComplexOps(self): self._assertOpOutputMatchesExpected( math_ops.expm1, np.array([[-1 + 2j, 3j, 2 - 3j]], dtype=dtype), - expected=np.expm1(np.array([[-1 + 2j, 3j, 2 - 3j]], dtype=dtype))) + expected=np.expm1(np.array([[-1 + 2j, 3j, 2 - 3j]], dtype=dtype)), + rtol=1e-6, + atol=1e-6) self._assertOpOutputMatchesExpected( math_ops.reciprocal, @@ -441,13 +450,13 @@ def testComplexOps(self): np.array([[5j, 3 - 2j]], dtype=dtype), expected=np.cos(np.array([[5j, 3 - 2j]], dtype=dtype))) - # TODO(b/34703906): improve log1p implementation and make tolerance - # tighter. self._assertOpOutputMatchesExpected( math_ops.log1p, np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype), expected=np.log1p( - np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype))) + np.array([[1e-14, 1e-15j, 0.6 - 0.3j]], dtype=dtype)), + rtol=1e-4, + atol=1e-6) val = np.array([1, 2j, 2 - 3j, 4 + 5j], dtype=dtype) self._assertOpOutputMatchesExpected( @@ -789,7 +798,9 @@ def _assertSoftplusMatchesExpected(self, features, dtype): zero = np.asarray(0).astype(dtype) expected = np.logaddexp(zero, features) self._assertOpOutputMatchesExpected( - nn_ops.softplus, features, expected=expected) + nn_ops.softplus, features, expected=expected, + rtol=1e-6, + atol=9.1e-6) def testSoftplus(self): for dtype in self.float_types: diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py index 8ecad00f6e23b3..2c09b03d5a35cd 100644 --- a/tensorflow/compiler/tests/variable_ops_test.py +++ b/tensorflow/compiler/tests/variable_ops_test.py @@ -187,6 +187,25 @@ def testTraining(self): rtol=1e-4) self.assertAllClose(np.array([1.9, 2.9], dtype=np.float32), vb, rtol=1e-4) + def testWriteOfAliasedTensor(self): + for dtype in self.numeric_types: + init = np.array([[1, 2j], [3, 4]]).astype(dtype) + update = np.array([[7, 1j], [2, 11]]).astype(dtype) + with self.test_session() as sess, self.test_scope(): + v = resource_variable_ops.ResourceVariable(init) + sess.run(variables.variables_initializer([v])) + p = array_ops.placeholder(dtype) + q = array_ops.identity(p) + x = v.read_value() + # Writes the value of 'p' to 'v', but keeps a reference to the original + # value of 'v' so the variable update cannot reuse its buffer. 
+ with ops.control_dependencies([x]): + y = v.assign(q) + result = sess.run([x, y, q], {p: update}) + self.assertAllClose(init, result[0]) + self.assertAllClose(update, result[1]) + self.assertAllClose(update, result[2]) + class StridedSliceAssignChecker(object): """Compares the results of a slice assignment using Tensorflow and numpy.""" diff --git a/tensorflow/compiler/tests/xla_device_gpu_test.py b/tensorflow/compiler/tests/xla_device_gpu_test.py new file mode 100644 index 00000000000000..1e30ebd55d09fe --- /dev/null +++ b/tensorflow/compiler/tests/xla_device_gpu_test.py @@ -0,0 +1,48 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Test cases for XLA devices.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.client import session as session_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import test + + +class XlaDeviceGpuTest(test.TestCase): + + def testCopiesToAndFromGpuWork(self): + """Tests that copies between GPU and XLA devices work.""" + if not test.is_gpu_available(): + return + + with session_lib.Session() as sess: + x = array_ops.placeholder(dtypes.float32, [2]) + with ops.device("GPU"): + y = x * 2 + with ops.device("device:XLA_CPU:0"): + z = y * y + with ops.device("GPU"): + w = y + z + result = sess.run(w, {x: [1.5, 0.5]}) + self.assertAllClose(result, [12., 2.], rtol=1e-3) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py index f5c228f8305d74..f0b010fa67f2ff 100644 --- a/tensorflow/compiler/tests/xla_device_test.py +++ b/tensorflow/compiler/tests/xla_device_test.py @@ -1,4 +1,4 @@ -# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,30 +18,40 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.client import session as session_lib -from tensorflow.python.framework import dtypes +import numpy as np + +from tensorflow.compiler.tests.xla_test import XLATestCase from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_control_flow_ops from tensorflow.python.platform import test -class XlaDeviceTest(test.TestCase): +class XlaDeviceTest(XLATestCase): def testCopies(self): - """Tests that copies between GPU and XLA devices work.""" - if not test.is_gpu_available(): - return - - with session_lib.Session() as sess: - x = array_ops.placeholder(dtypes.float32, [2]) - with ops.device("GPU"): - y = x * 2 - with ops.device("device:XLA_CPU:0"): - z = y * y - with ops.device("GPU"): - w = y + z - result = sess.run(w, {x: [1.5, 0.5]}) - self.assertAllClose(result, [12., 2.], rtol=1e-3) + """Tests that copies onto and off XLA devices work.""" + shapes = [[0], [1], [1, 0], [1024, 0], [1024, 1], [3, 777], [777, 3], + [16384, 1], [1, 16384], [1, 20000, 1, 1]] + for dtype in self.numeric_types: + for shape in shapes: + with self.test_session() as sess: + with ops.device("CPU"): + x = array_ops.placeholder(dtype, shape) + with self.test_scope(): + y = x + x + with ops.device("CPU"): + z = array_ops.identity(y) + + inputs = np.random.randint(-100, 100, shape).astype(dtype) + result = sess.run(z, {x: inputs}) + self.assertAllCloseAccordingToType(result, inputs + inputs) + + def testControlTrigger(self): + with self.test_session() as sess: + with self.test_scope(): + x = gen_control_flow_ops.control_trigger() + sess.run(x) if __name__ == "__main__": diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 382d3d1aa9cf27..b75bfe96073e6f 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -82,7 +82,7 @@ cc_library( "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops", "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla/client", - "//tensorflow/compiler/xla/client:computation", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -171,9 +171,9 @@ cc_library( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:framework", @@ -218,7 +218,6 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:sharding_builder", "//tensorflow/core:core_cpu", "//tensorflow/core:core_cpu_internal", @@ -329,6 +328,7 @@ tf_cc_test( "//tensorflow/compiler/tf2xla/kernels:xla_ops", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/client:client_library", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service:cpu_plugin", diff --git 
a/tensorflow/compiler/tf2xla/cc/BUILD b/tensorflow/compiler/tf2xla/cc/BUILD index 4f8bb8ad743afe..ea8d1b3d14939d 100644 --- a/tensorflow/compiler/tf2xla/cc/BUILD +++ b/tensorflow/compiler/tf2xla/cc/BUILD @@ -27,3 +27,25 @@ cc_library( "//tensorflow/core:protos_all_cc", ], ) + +tf_gen_op_wrapper_cc( + name = "xla_jit_op_gen", + out_ops_file = "ops/xla_jit_op", + deps = ["//tensorflow/compiler/jit/ops:xla_ops"], +) + +cc_library( + name = "xla_jit_ops", + srcs = ["ops/xla_jit_op.cc"], + hdrs = ["ops/xla_jit_op.h"], + deps = [ + "//tensorflow/cc:const_op", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", + "//tensorflow/compiler/jit/ops:xla_ops", + "//tensorflow/core:core_cpu", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core:protos_all_cc", + ], +) diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 8d1f2684909e87..42585ad4d8a17d 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -282,7 +282,58 @@ Status BuildLoopBody(const Graph& graph, Frame* frame, return Status::OK(); } -Status FunctionalizeLoop(Graph* graph, Frame* frame, +// Copy the FunctionDef of the given function from lookup_library to library, if +// it can be found in lookup_library but is missing from library. +Status AddMissingFunctionByName(const string& function_name, + const FunctionLibraryDefinition* lookup_library, + FunctionLibraryDefinition* library) { + if (!library->Find(function_name) && lookup_library->Find(function_name)) { + return library->AddFunctionDef(*lookup_library->Find(function_name)); + } + return Status::OK(); +} + +// Iterate over all functions that the given fdef refers to. Copy the missing +// FunctionDefs from lookup_library to library. +Status AddMissingFunctionDef(const FunctionDef& fdef, + const FunctionLibraryDefinition* lookup_library, + FunctionLibraryDefinition* library) { + TF_RET_CHECK(lookup_library); + for (const NodeDef& node : fdef.node_def()) { + if (library->Find(node.op())) { + continue; + } + // The function referred to by a 'SymbolicGradient' node is specified in its + // attribute 'f'. + if (node.op() == FunctionLibraryDefinition::kGradientOp) { + const AttrValue* attr = + AttrSlice(&node.attr()).Find(FunctionLibraryDefinition::kFuncAttr); + if (!attr) { + return errors::InvalidArgument("SymbolicGradient is missing attr: f"); + } + const string& func_name = attr->func().name(); + TF_RETURN_IF_ERROR( + AddMissingFunctionByName(func_name, lookup_library, library)); + // Copy the user-defined gradient function if it exists.
+ const string grad_name = lookup_library->FindGradient(func_name); + if (!grad_name.empty() && library->FindGradient(func_name).empty()) { + TF_RETURN_IF_ERROR( + AddMissingFunctionByName(grad_name, lookup_library, library)); + GradientDef grad_def; + grad_def.set_function_name(func_name); + grad_def.set_gradient_func(grad_name); + TF_RETURN_IF_ERROR(library->AddGradientDef(grad_def)); + } + } else if (lookup_library->Find(node.op())) { + TF_RETURN_IF_ERROR( + library->AddFunctionDef(*lookup_library->Find(node.op()))); + } + } + return Status::OK(); +} + +Status FunctionalizeLoop(const FunctionLibraryDefinition* lookup_library, + Graph* graph, Frame* frame, FunctionLibraryDefinition* library) { VLOG(2) << "Frame " << frame->name << " before: " << dump_graph::DumpGraphToFile("functionalize_before", *graph, @@ -489,6 +540,14 @@ Status FunctionalizeLoop(Graph* graph, Frame* frame, TF_RETURN_IF_ERROR(library->AddFunctionDef(cond_fdef)); TF_RETURN_IF_ERROR(library->AddFunctionDef(body_fdef)); + if (lookup_library) { + // Copy missing FunctionDefs from lookup_library to library to make library + // self-contained. + TF_RETURN_IF_ERROR( + AddMissingFunctionDef(cond_fdef, lookup_library, library)); + TF_RETURN_IF_ERROR( + AddMissingFunctionDef(body_fdef, lookup_library, library)); + } // Builds a While operator. NodeDef while_def; @@ -1365,6 +1424,12 @@ Status FunctionalizeCond::Functionalize(Graph* graph, // functional equivalents. Status FunctionalizeControlFlow(Graph* graph, FunctionLibraryDefinition* library) { + return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library); +} + +Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library, + Graph* graph, + FunctionLibraryDefinition* library) { VLOG(2) << "FunctionalizeControlFlow (initial): " << dump_graph::DumpGraphToFile("functionalize_initial", *graph, library); @@ -1434,7 +1499,8 @@ Status FunctionalizeControlFlow(Graph* graph, continue; } - TF_RETURN_IF_ERROR(FunctionalizeLoop(graph, frame, library)); + TF_RETURN_IF_ERROR( + FunctionalizeLoop(lookup_library, graph, frame, library)); // If the parent has no remaining children, add it to the worklist. --frame->parent->num_children; diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h index 4d4ee3054c2914..d941041d155324 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h @@ -22,9 +22,13 @@ limitations under the License. namespace tensorflow { // Transformation that converts tf.while_loop() loops into functional While -// operators, suitable for XLA compilation. +// operators, suitable for XLA compilation. If lookup_library is provided, use +// it to make the library for control flow self-contained. 
Status FunctionalizeControlFlow(Graph* graph, FunctionLibraryDefinition* library); +Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library, + Graph* graph, + FunctionLibraryDefinition* library); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index e494f42e8ed254..14977a908ae2b0 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -299,6 +299,131 @@ TEST(FunctionalizeControlFlow, OneLoopVar) { } } +// @function.Defun(noinline=True) +// def increment_fn(x): +// return [x + 1] +// Define the above function, and add it to the given graph. It's used as the +// while loop body in NoinlineLoopBody test. +Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) { + FunctionDef fdef = FunctionDefHelper::Create( + "increment_fn", {"x:int32"}, {"add:int32"}, {}, + { + {{"add/y"}, "Const", {}, {{"dtype", DT_INT32}}}, + {{"add_0"}, "Add", {"x", "add/y:output:0"}, {{"T", DT_INT32}}}, + }, + {{"add", "add_0:z:0"}}); + (*fdef.mutable_attr())["_noinline"].set_b(true); + FunctionDefLibrary fdef_lib; + *(fdef_lib.add_function()) = fdef; + TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib)); + NodeDef increment_fn; + increment_fn.set_name(node_name); + increment_fn.set_op("increment_fn"); + *increment_fn.add_input() = "while/Identity"; + *increment_fn.add_input() = "^while/Identity"; + Status status; + graph->AddNode(increment_fn, &status); + return status; +} + +// Graph: +// x = array_ops.placeholder(dtypes.int32) +// y = control_flow_ops.while_loop(lambda i: i < 10, increment_fn, [x]) +TEST(FunctionalizeControlFlow, NoinlineLoopBody) { + const string& noinline_node_name = "while/increment_fn"; + Graph graph(OpRegistry::Global()); + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto dummy = ops::Placeholder(scope.WithOpName("Dummy"), DT_INT32); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto enter = ops::internal::Enter(scope.WithOpName("while/Enter"), source, + "while/while_context"); + auto merge = ops::Merge(scope.WithOpName("while/Merge"), + std::initializer_list{enter, dummy}); + auto ten = ops::Const( + scope.WithOpName("while/Less/y").WithControlDependencies(merge.output), + 10); + auto less = ops::Less(scope.WithOpName("while/Less"), merge.output, ten); + auto loop_cond = ops::LoopCond(scope.WithOpName("while/LoopCond"), less); + auto switch_ = + ops::Switch(scope.WithOpName("while/Switch"), merge.output, loop_cond); + auto exit = ops::internal::Exit(scope.WithOpName("while/Exit"), + switch_.output_false); + auto identity = + ops::Identity(scope.WithOpName("while/Identity"), switch_.output_true); + + TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph())); + + NodeDef next_iter; + next_iter.set_name("while/NextIteration"); + next_iter.set_op("NextIteration"); + *next_iter.add_input() = noinline_node_name; + (*next_iter.mutable_attr())["T"].set_type(DT_INT32); + + Status status; + Node* n = scope.graph()->AddNode(next_iter, &status); + TF_ASSERT_OK(status); + + // Remove the dummy node and add the loop backedge. 
+ scope.graph()->RemoveNode(dummy.node()); + scope.graph()->AddEdge(n, 0, merge.output.node(), 1); + TF_ASSERT_OK(scope.ToGraph(&graph)); + } + + FunctionLibraryDefinition lookup_lib(graph.flib_def()); + FunctionLibraryDefinition library(OpRegistry::Global(), {}); + // Function increment_fn will be copied from lookup_lib to library. + TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library)); + + GraphDef graph_def; + graph.ToGraphDef(&graph_def); + + NameAttrList cond_fn, body_fn; + TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn)); + + // Outer graph + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32); + auto while_op = + ops::XlaWhile(scope.WithOpName("while/LoopCond"), + std::initializer_list{source}, cond_fn, body_fn); + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + TF_EXPECT_GRAPH_EQ(expected, graph_def); + } + + // Body graph. + { + Scope scope = Scope::NewRootScope().ExitOnError(); + auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0); + TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph())); + auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg); + NodeDef retval; + retval.set_name("_retval0_RetVal"); + retval.set_op(FunctionLibraryDefinition::kRetOp); + *retval.add_input() = noinline_node_name; + (*retval.mutable_attr())["T"].set_type(DT_INT32); + (*retval.mutable_attr())["index"].set_i(0); + Status status; + scope.graph()->AddNode(retval, &status); + TF_ASSERT_OK(status); + + GraphDef expected; + TF_ASSERT_OK(scope.ToGraphDef(&expected)); + + InstantiationResultForTest result; + // Verify that increment_fn has been copied to library. + TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result)); + + EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types); + EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types); + // Ignore the function library when comparing the graphs. + expected.clear_library(); + TF_EXPECT_GRAPH_EQ(expected, result.gdef); + } +} + // Tests functionalizing OneLoopVar where the loop value is not used post the // loop. 
// Graph: diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index b20c1ffc7d8956..212f6f3966149c 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -51,6 +51,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, const std::vector& expressions, std::vector* args) { auto builder = ctx->builder(); + auto client = ctx->compiler()->client(); std::vector compile_time_constant_flags(expressions.size()); TF_RETURN_IF_ERROR( @@ -72,8 +73,10 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, arg.kind = XlaCompiler::Argument::kConstant; TF_RET_CHECK(expressions[i]->resource() == nullptr) << "Input with resource is not yet implemented."; + TF_ASSIGN_OR_RETURN(auto constant_graph, builder->BuildConstantSubGraph( + expressions[i]->handle())); TF_ASSIGN_OR_RETURN(auto literal, - builder->ComputeConstant(expressions[i]->handle())); + client->ComputeConstant(constant_graph)); TF_RETURN_IF_ERROR( LiteralToHostTensor(*literal, arg.type, &arg.constant_value)); } else { @@ -205,14 +208,15 @@ Status GraphCompiler::CompileFunctionalNode(Node* n, TF_RETURN_IF_ERROR( PrepareArguments(&xla_op_context, graph.get(), expressions, &arguments)); + XlaCompiler::CompileOptions compile_options; + compile_options.is_entry_computation = false; XlaCompiler::CompilationResult result; - - TF_RETURN_IF_ERROR(compiler->CompileFunction(XlaCompiler::CompileOptions(), - func, arguments, &result)); + TF_RETURN_IF_ERROR( + compiler->CompileFunction(compile_options, func, arguments, &result)); TF_RET_CHECK(arguments.size() == expressions.size()); - std::vector handles; + std::vector handles; for (int64 i = 0; i < expressions.size(); ++i) { if (arguments[i].kind == XlaCompiler::Argument::kConstant) { continue; @@ -226,11 +230,14 @@ Status GraphCompiler::CompileFunctionalNode(Node* n, auto output_handle = b->Call(*result.computation, handles); // The output handle of `Call` computation is a tuple type. Unzip it so // that it can fit into future computations. 
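+ // Constant outputs are emitted directly via SetConstantOutput and never
+ // become tuple elements, so computation_output below indexes only the
+ // non-constant outputs of the Call result.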
+ int computation_output = 0; for (int64 i = 0; i < n->num_outputs(); ++i) { if (result.outputs[i].is_constant) { xla_op_context.SetConstantOutput(i, result.outputs[i].constant_value); } else { - xla_op_context.SetOutput(i, b->GetTupleElement(output_handle, i)); + xla_op_context.SetOutput( + i, b->GetTupleElement(output_handle, computation_output)); + ++computation_output; } } return b->first_error(); diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index 00fd08b1a07507..edd2ab6301ee89 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -18,6 +18,7 @@ tf_kernel_library( "bcast_ops.cc", "bias_ops.cc", "binary_ops.cc", + "bucketize_op.cc", "cast_op.cc", "categorical_op.cc", "cholesky_op.cc", @@ -45,6 +46,7 @@ tf_kernel_library( "image_resize_ops.cc", "index_ops.cc", "l2loss_op.cc", + "listdiff_op.cc", "lrn_ops.cc", "matmul_op.cc", "matrix_band_part_op.cc", @@ -114,8 +116,8 @@ tf_kernel_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/core:framework", "//tensorflow/core:image_ops_op_lib", "//tensorflow/core:lib", @@ -151,7 +153,7 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -167,7 +169,7 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla/ops:xla_ops", "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -203,8 +205,8 @@ tf_kernel_library( "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/kernels:argmax_op", diff --git a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc index 5c9f66df101bfb..1e59868621475c 100644 --- a/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/aggregate_ops.cc @@ -29,7 +29,7 @@ class AddNOp : public XlaOpKernel { OP_REQUIRES(ctx, ctx->num_inputs() >= 1, errors::InvalidArgument("AddN requires at least one argument")); - xla::ComputationDataHandle sum = ctx->Input(0); + xla::XlaOp sum = ctx->Input(0); for (int i = 1; i < ctx->num_inputs(); ++i) { sum = ctx->builder()->Add(sum, ctx->Input(i)); } diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc index 931175be1111ed..15e1815a4cf07f 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc @@ -48,9 +48,9 @@ class FusedBatchNormOp : public 
XlaOpKernel { OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(ctx->input_type(1), &scale_type)); - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); TensorShape input_shape = ctx->InputShape(0); int feature_index = @@ -62,7 +62,7 @@ class FusedBatchNormOp : public XlaOpKernel { input = builder->ConvertElementType(input, scale_type); if (is_training_) { - xla::ComputationDataHandle output = builder->BatchNormTraining( + xla::XlaOp output = builder->BatchNormTraining( input, ctx->Input(1), ctx->Input(2), epsilon_, feature_index); // In training mode, outputs the normalized value as well as the @@ -79,7 +79,7 @@ class FusedBatchNormOp : public XlaOpKernel { ctx->SetOutput(3, builder->GetTupleElement(output, 1)); ctx->SetOutput(4, builder->GetTupleElement(output, 2)); } else { - xla::ComputationDataHandle output = builder->BatchNormInference( + xla::XlaOp output = builder->BatchNormInference( input, ctx->Input(1), ctx->Input(2), ctx->Input(3), ctx->Input(4), epsilon_, feature_index); ctx->SetOutput(0, builder->ConvertElementType(output, input_type)); @@ -118,7 +118,7 @@ class FusedBatchNormGradOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* const b = ctx->builder(); + xla::XlaBuilder* const b = ctx->builder(); DataType input_dtype = ctx->input_type(0); DataType scale_dtype = ctx->input_type(2); @@ -137,11 +137,11 @@ class FusedBatchNormGradOp : public XlaOpKernel { const int feature_index = GetTensorFeatureDimIndex(input_dims, data_format_); - xla::ComputationDataHandle x_backprop; - xla::ComputationDataHandle scale_backprop; - xla::ComputationDataHandle offset_backprop; + xla::XlaOp x_backprop; + xla::XlaOp scale_backprop; + xla::XlaOp offset_backprop; if (is_training_) { - xla::ComputationDataHandle output = + xla::XlaOp output = b->BatchNormGrad(activations, scale, mean, var, grad_backprop, epsilon_, feature_index); diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc index 569950c2dfaeb6..642278ab994bf3 100644 --- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc @@ -20,9 +20,8 @@ limitations under the License. namespace tensorflow { namespace { -void BatchToSpace(XlaOpKernelContext* ctx, - const xla::ComputationDataHandle& input, DataType input_dtype, - const TensorShape& input_tensor_shape, +void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, + DataType input_dtype, const TensorShape& input_tensor_shape, gtl::ArraySlice block_shape, const xla::Literal& crops) { const int input_rank = input_tensor_shape.dims(); @@ -46,7 +45,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, ", 2] instead of ", xla::ShapeUtil::HumanString(crops.shape()))); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const int64 batch_size = input_shape[0]; // Compute the product of the block_shape values. @@ -73,7 +72,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, reshaped_shape[block_rank] = batch_size / block_num_elems; std::copy(input_shape.begin() + 1, input_shape.end(), reshaped_shape.begin() + block_rank + 1); - xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape); + xla::XlaOp reshaped = b->Reshape(input, reshaped_shape); // 2. 
Permute dimensions of `reshaped` to produce `permuted` of shape // [batch / prod(block_shape), @@ -91,7 +90,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, } std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(), 1 + block_rank * 2); - xla::ComputationDataHandle permuted = b->Transpose(reshaped, permutation); + xla::XlaOp permuted = b->Transpose(reshaped, permutation); // 3. Reshape `permuted` to produce `reshaped_permuted` of shape // [batch / prod(block_shape), @@ -111,8 +110,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, std::copy(remainder_shape.begin(), remainder_shape.end(), reshaped_permuted_shape.begin() + 1 + block_rank); - xla::ComputationDataHandle reshaped_permuted = - b->Reshape(permuted, reshaped_permuted_shape); + xla::XlaOp reshaped_permuted = b->Reshape(permuted, reshaped_permuted_shape); // 4. Crop the start and end of dimensions `[1, ..., M]` of // `reshaped_permuted` according to `crops` to produce the output of shape: @@ -139,7 +137,7 @@ void BatchToSpace(XlaOpKernelContext* ctx, "Cropped size must be non-negative: start: ", crop_start, " end: ", crop_end, " size ", reshaped_permuted_shape[1 + i])); } - xla::ComputationDataHandle output = + xla::XlaOp output = b->Slice(reshaped_permuted, start_indices, end_indices, strides); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc index ed33b8ed2e823f..9d677f426650ea 100644 --- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc @@ -60,7 +60,7 @@ class BiasOp : public XlaOpKernel { "of the input tensor: ", bias_shape.DebugString(), " vs. ", input_shape.DebugString())); - xla::ComputationDataHandle result = + xla::XlaOp result = ctx->builder()->Add(ctx->Input(0), ctx->Input(1), {feature_dim}); ctx->SetOutput(0, result); } @@ -103,7 +103,7 @@ class BiasAddGradOp : public XlaOpKernel { std::iota(reduce_dims.begin(), reduce_dims.begin() + feature_dim, 0); std::iota(reduce_dims.begin() + feature_dim, reduce_dims.end(), feature_dim + 1); - xla::ComputationBuilder* const b = ctx->builder(); + xla::XlaBuilder* const b = ctx->builder(); const DataType accumulation_type = XlaHelpers::SumAccumulationType(input_type(0)); auto converted = diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index 2436a6074a11ad..f04cde878e9800 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -19,7 +19,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" @@ -34,14 +34,13 @@ namespace { class NAME##Op : public XlaBinaryOp { \ public: \ explicit NAME##Op(OpKernelConstruction* ctx) : XlaBinaryOp(ctx) {} \ - xla::ComputationDataHandle Computation( \ - XlaOpKernelContext* ctx, const xla::ComputationDataHandle& lhs, \ - const gtl::ArraySlice& lhs_shape, \ - const xla::ComputationDataHandle& rhs, \ + xla::XlaOp Computation( \ + XlaOpKernelContext* ctx, const xla::XlaOp& lhs, \ + const gtl::ArraySlice& lhs_shape, const xla::XlaOp& rhs, \ const gtl::ArraySlice& rhs_shape, \ const BCast& broadcast_helper, \ const std::vector& extend_dimensions) override { \ - xla::ComputationBuilder* b = ctx->builder(); \ + xla::XlaBuilder* b = ctx->builder(); \ return HLO; \ } \ }; \ @@ -63,11 +62,8 @@ XLA_MAKE_BINARY(Complex, b->Complex(lhs, rhs, extend_dimensions)); // } else { // return x / y; // } -static xla::ComputationDataHandle FloorDivImpl(xla::ComputationBuilder* b, - DataType dtype, - xla::ComputationDataHandle x, - xla::ComputationDataHandle y, - const BCast& broadcast_helper) { +static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x, + xla::XlaOp y, const BCast& broadcast_helper) { std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper); auto zero = XlaHelpers::Zero(b, dtype); auto one = XlaHelpers::One(b, dtype); @@ -87,11 +83,8 @@ XLA_MAKE_BINARY(FloorDiv, // Implementation of FloorMod. Pseudo-code: // T trunc_mod = std::fmod(x, y); // return (x < T(0)) == (y < T(0)) ? trunc_mod : std::fmod(trunc_mod + y, y); -static xla::ComputationDataHandle FloorModImpl(xla::ComputationBuilder* b, - DataType dtype, - xla::ComputationDataHandle x, - xla::ComputationDataHandle y, - const BCast& broadcast_helper) { +static xla::XlaOp FloorModImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x, + xla::XlaOp y, const BCast& broadcast_helper) { std::tie(x, y) = XlaBinaryOp::Broadcast(b, x, y, broadcast_helper); auto zero = XlaHelpers::Zero(b, dtype); auto same_sign = b->Eq(b->Lt(x, zero), b->Lt(y, zero)); @@ -127,8 +120,7 @@ XLA_MAKE_BINARY(SqrtGrad, XlaHelpers::FloatLiteral(b, input_type(0), 0.5)), lhs, extend_dimensions)); -static xla::ComputationDataHandle Square(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& x) { +static xla::XlaOp Square(xla::XlaBuilder* builder, const xla::XlaOp& x) { return builder->Mul(x, x); } @@ -175,11 +167,11 @@ class ApproximateEqualOp : public XlaOpKernel { // Computes the max of the scalar input x and 0. 
void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); auto abs = b->Abs(b->Sub(ctx->Input(0), ctx->Input(1))); auto abs_shape = b->GetShape(abs); OP_REQUIRES_OK(ctx, abs_shape.status()); - auto abs_type = abs_shape.ValueOrDie()->element_type(); + auto abs_type = abs_shape.ValueOrDie().element_type(); auto result = b->Lt( abs, b->ConvertElementType(b->ConstantR0(tolerance_), abs_type)); ctx->SetOutput(0, result); diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc new file mode 100644 index 00000000000000..ca9a6b40688d1e --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc @@ -0,0 +1,67 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace { + +class BucketizeOp : public XlaOpKernel { + public: + explicit BucketizeOp(OpKernelConstruction* context) : XlaOpKernel(context) { + OP_REQUIRES_OK(context, context->GetAttr("boundaries", &boundaries_)); + OP_REQUIRES(context, std::is_sorted(boundaries_.begin(), boundaries_.end()), + errors::InvalidArgument("Expected sorted boundaries")); + } + + void Compile(XlaOpKernelContext* context) override { + xla::XlaBuilder* builder = context->builder(); + const DataType dtype = context->input_type(0); + xla::XlaOp input = context->Input(0); + + xla::XlaOp boundaries = builder->ConstantR1(boundaries_); + // TODO(phawkins): the following behavior matches the behavior of the core + // Bucketize kernel. However, comparing an int32 or int64 against float may + // lead to inaccurate bucketing due to rounding. 
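+ // The bucket index is the number of boundaries the input is >= to: the
+ // input is compared against every boundary (Ge) and the 0/1 results are
+ // summed, e.g. boundaries {0, 10, 100} and an input of 15 give comparisons
+ // {1, 1, 0}, i.e. bucket 2.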
+ if (dtype == DT_DOUBLE) { + input = builder->ConvertElementType(input, xla::F64); + boundaries = builder->ConvertElementType(boundaries, xla::F64); + } else { + input = builder->ConvertElementType(input, xla::F32); + } + xla::XlaOp comparison = builder->ConvertElementType( + builder->Ge(builder->Broadcast(input, {1}), boundaries, + /*broadcast_dimensions=*/{0}), + xla::S32); + xla::XlaOp buckets = builder->Reduce( + comparison, /*init_value=*/builder->ConstantR0(0), + /*computation=*/xla::CreateScalarAddComputation(xla::S32, builder), + /*dimensions_to_reduce=*/{0}); + context->SetOutput(0, buckets); + } + + private: + std::vector boundaries_; +}; + +REGISTER_XLA_OP(Name("Bucketize"), BucketizeOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/cast_op.cc b/tensorflow/compiler/tf2xla/kernels/cast_op.cc index c52b2dcb7e9ef8..e9d98c768572c5 100644 --- a/tensorflow/compiler/tf2xla/kernels/cast_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cast_op.cc @@ -33,9 +33,9 @@ class CastOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); - xla::ComputationDataHandle input = ctx->Input(0); - xla::ComputationDataHandle output; + xla::XlaBuilder* builder = ctx->builder(); + xla::XlaOp input = ctx->Input(0); + xla::XlaOp output; if (src_dtype_ == dst_dtype_) { output = input; @@ -72,9 +72,9 @@ class BitcastOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); - xla::ComputationDataHandle input = ctx->Input(0); - xla::ComputationDataHandle output; + xla::XlaBuilder* builder = ctx->builder(); + xla::XlaOp input = ctx->Input(0); + xla::XlaOp output; if (src_dtype_ == dst_dtype_) { output = input; diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index 545aa364f937b2..835a7f568945f0 100644 --- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -34,7 +34,7 @@ class CategoricalOp : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { // Get the logits - const xla::ComputationDataHandle& logits = ctx->Input(0); + const xla::XlaOp& logits = ctx->Input(0); TensorShape logits_shape = ctx->InputShape(0); int64 num_samples; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &num_samples)); @@ -56,7 +56,7 @@ class CategoricalOp : public XlaOpKernel { const int64 batch_size = logits_shape.dim_size(0); const int64 num_classes = logits_shape.dim_size(1); - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); std::array uniform_shape_array = { {batch_size, num_samples, num_classes}}; @@ -78,7 +78,7 @@ class CategoricalOp : public XlaOpKernel { /*broadcast_dimensions=*/{0, 2}); TensorShape softmax_shape(uniform_shape_array); - xla::ComputationDataHandle argmax; + xla::XlaOp argmax; OP_REQUIRES_OK( ctx, XlaHelpers::ArgMax(builder, ctx, softmax_entries, softmax_shape, diff --git a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc index fdf75be7b11565..a00bc912f9f400 100644 --- a/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/clip_by_value_op.cc @@ -29,7 +29,7 @@ class ClipByValueOp : public XlaOpKernel { const TensorShape min_shape = ctx->InputShape(1); const TensorShape max_shape = ctx->InputShape(2); - 
xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); auto input = ctx->Input(0); auto min = ctx->Input(1); auto max = ctx->Input(2); diff --git a/tensorflow/compiler/tf2xla/kernels/concat_op.cc b/tensorflow/compiler/tf2xla/kernels/concat_op.cc index 1a246e8df9b2cd..78285affa1c399 100644 --- a/tensorflow/compiler/tf2xla/kernels/concat_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/concat_op.cc @@ -54,7 +54,7 @@ class ConcatBaseOp : public XlaOpKernel { // TODO(annarev): add a helper to support int64 input. const int32 concat_dim = literal.Get({}); - std::vector values; + std::vector values; std::vector shapes; OP_REQUIRES_OK(ctx, ctx->InputList("values", &values, &shapes)); const int N = values.size(); @@ -70,13 +70,13 @@ class ConcatBaseOp : public XlaOpKernel { "[", -input_dims, ", ", input_dims, "), but got ", concat_dim)); - // Make a vector holding the ComputationDataHandles for each of - // the inputs that has non-zero elements. - std::vector input_data; + // Make a vector holding the XlaOp for each of the inputs that has non-zero + // elements. + std::vector input_data; int output_concat_dim = 0; const bool input_is_scalar = IsLegacyScalar(input_shape); for (int i = 0; i < N; ++i) { - xla::ComputationDataHandle handle = values[i]; + xla::XlaOp handle = values[i]; const TensorShape& in_shape = shapes[i]; const bool in_is_scalar = IsLegacyScalar(in_shape); OP_REQUIRES( diff --git a/tensorflow/compiler/tf2xla/kernels/const_op.cc b/tensorflow/compiler/tf2xla/kernels/const_op.cc index 8f78b4c8f90cf0..59d06c654de18c 100644 --- a/tensorflow/compiler/tf2xla/kernels/const_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/const_op.cc @@ -45,7 +45,7 @@ class ConstOp : public XlaOpKernel { ctx->SetInvalidOutput(0); return; } - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); // To avoid blowups for large constants filled with the same value, // recognize that case and emit a scalar broadcast instead. diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc index c0ee0c9c2ea849..627bad12f33c82 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc @@ -47,9 +47,8 @@ TensorShape ExpandedFilterShapeForDepthwiseConvolution( } // Broadcast zeros to ExpandedFilterShapeForDepthwiseConvolution. -xla::ComputationDataHandle CreateExpandedZero( - const TensorShape& filter_shape, DataType dtype, - xla::ComputationBuilder* builder) { +xla::XlaOp CreateExpandedZero(const TensorShape& filter_shape, DataType dtype, + xla::XlaBuilder* builder) { TensorShape expanded_filter_shape = ExpandedFilterShapeForDepthwiseConvolution(filter_shape); return builder->Broadcast(XlaHelpers::Zero(builder, dtype), @@ -87,8 +86,8 @@ xla::ComputationDataHandle CreateExpandedZero( // // Finally compare A and broadcasted B in dimension 2 amd return the result at // the beginning of the comment. 
-xla::ComputationDataHandle CreateExpandedFilterMask( - const TensorShape& filter_shape, xla::ComputationBuilder* builder) { +xla::XlaOp CreateExpandedFilterMask(const TensorShape& filter_shape, + xla::XlaBuilder* builder) { TensorShape expanded_filter_shape = ExpandedFilterShapeForDepthwiseConvolution(filter_shape); int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1); @@ -96,11 +95,11 @@ xla::ComputationDataHandle CreateExpandedFilterMask( // Create a M sized linspace and an M*N sized linspace that will be // broadcasted into perpendicular dimensions and compared. - xla::ComputationDataHandle input_feature_iota; + xla::XlaOp input_feature_iota; // DT_INT32 Iota will always return status::OK(). TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, input_feature, &input_feature_iota)); - xla::ComputationDataHandle expanded_feature_iota; + xla::XlaOp expanded_feature_iota; TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, input_feature * depthwise_multiplier, &expanded_feature_iota)); @@ -126,10 +125,10 @@ xla::ComputationDataHandle CreateExpandedFilterMask( // Expands a filter of shape [H, W, ..., M, N] to [H, W, ..., M, M*N] by adding // zeros for the cross-depth filters. Used to build a depthwise convolution. -xla::ComputationDataHandle ExpandFilterForDepthwiseConvolution( - const TensorShape& filter_shape, DataType dtype, - const xla::ComputationDataHandle& filter, - xla::ComputationBuilder* builder) { +xla::XlaOp ExpandFilterForDepthwiseConvolution(const TensorShape& filter_shape, + DataType dtype, + const xla::XlaOp& filter, + xla::XlaBuilder* builder) { int64 depthwise_multiplier = filter_shape.dim_size(filter_shape.dims() - 1); int64 input_feature = filter_shape.dim_size(filter_shape.dims() - 2); TensorShape expanded_filter_shape = @@ -156,10 +155,11 @@ xla::ComputationDataHandle ExpandFilterForDepthwiseConvolution( } // Inverse of ExpandFilterForDepthwiseConvolution. 
-xla::ComputationDataHandle ContractFilterForDepthwiseBackprop( - XlaOpKernelContext* ctx, const TensorShape& filter_shape, DataType dtype, - const xla::ComputationDataHandle& filter_backprop, - xla::ComputationBuilder* builder) { +xla::XlaOp ContractFilterForDepthwiseBackprop(XlaOpKernelContext* ctx, + const TensorShape& filter_shape, + DataType dtype, + const xla::XlaOp& filter_backprop, + xla::XlaBuilder* builder) { TensorShape expanded_filter_shape = ExpandedFilterShapeForDepthwiseConvolution(filter_shape); auto masked_expanded_filter = builder->Select( @@ -248,9 +248,9 @@ class ConvOp : public XlaOpKernel { "input and filter must have the same depth: ", in_depth, " vs ", input_shape.dim_size(feature_dim))); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); - xla::ComputationDataHandle filter = ctx->Input(1); + xla::XlaOp filter = ctx->Input(1); TensorShape expanded_filter_shape = filter_shape; if (depthwise_) { filter = ExpandFilterForDepthwiseConvolution( @@ -288,7 +288,7 @@ class ConvOp : public XlaOpKernel { &unused_output_size, &padding[i].first, &padding[i].second)); } - xla::ComputationDataHandle conv = + xla::XlaOp conv = b->ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding, lhs_dilation, rhs_dilation, dims); ctx->SetOutput(0, conv); @@ -391,7 +391,7 @@ class ConvBackpropInputOp : public XlaOpKernel { expanded_filter_shape, out_backprop_shape, dilations_, strides_, padding_, data_format_, &dims)); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); auto filter = ctx->Input(1); auto out_backprop = ctx->Input(2); @@ -435,12 +435,11 @@ class ConvBackpropInputOp : public XlaOpKernel { } // Mirror the filter in the spatial dimensions. - xla::ComputationDataHandle mirrored_weights = - b->Rev(filter, kernel_spatial_dims); + xla::XlaOp mirrored_weights = b->Rev(filter, kernel_spatial_dims); // activation gradients // = gradients (with padding and dilation) mirrored_weights - xla::ComputationDataHandle in_backprop = b->ConvGeneralDilated( + xla::XlaOp in_backprop = b->ConvGeneralDilated( out_backprop, mirrored_weights, /*window_strides=*/ones, padding, lhs_dilation, rhs_dilation, dnums); @@ -546,9 +545,9 @@ class ConvBackpropFilterOp : public XlaOpKernel { expanded_filter_shape, out_backprop_shape, dilations_, strides_, padding_, data_format_, &dims)); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle activations = ctx->Input(0); - xla::ComputationDataHandle gradients = ctx->Input(2); + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp activations = ctx->Input(0); + xla::XlaOp gradients = ctx->Input(2); // The filter gradients are computed by a convolution of the input // activations and the output gradients, with some appropriate padding. 
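 // A minimal sketch of the depthwise shape arithmetic used above (hypothetical
 // helper, assuming the [H, W, in_depth, multiplier] layout described earlier):
 //   TensorShape expanded = filter_shape;
 //   expanded.set_dim(filter_shape.dims() - 1,
 //                    filter_shape.dim_size(filter_shape.dims() - 2) *  // in_depth
 //                    filter_shape.dim_size(filter_shape.dims() - 1));  // multiplier
 // e.g. a 3x3 depthwise filter over 8 channels with multiplier 2 expands to
 // [3, 3, 8, 16], and ContractFilterForDepthwiseBackprop maps the filter
 // gradient back to the original [3, 3, 8, 2] shape.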
diff --git a/tensorflow/compiler/tf2xla/kernels/cross_op.cc b/tensorflow/compiler/tf2xla/kernels/cross_op.cc index 3df8c00f1b8355..7fcd4170fb79a5 100644 --- a/tensorflow/compiler/tf2xla/kernels/cross_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cross_op.cc @@ -53,7 +53,7 @@ class CrossOp : public XlaOpKernel { } std::vector strides(in0_shape.dims(), 1); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); auto in0 = ctx->Input(0); auto in1 = ctx->Input(1); starts.back() = 0; diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc index 0cf03ceb948a51..01aa1a83e79679 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc @@ -22,7 +22,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/util/bcast.h" @@ -75,7 +75,7 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) { } // Call virtual method to emit the computation. - xla::ComputationDataHandle output = + xla::XlaOp output = Computation(ctx, lhs_handle, lhs_shape.dim_sizes(), rhs_handle, rhs_shape.dim_sizes(), bcast, extend_dimension); @@ -85,11 +85,9 @@ void XlaBinaryOp::Compile(XlaOpKernelContext* ctx) { ctx->SetOutput(0, output); } -/* static */ std::pair -XlaBinaryOp::Broadcast(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& lhs, - const xla::ComputationDataHandle& rhs, - const BCast& broadcast_helper) { +/* static */ std::pair XlaBinaryOp::Broadcast( + xla::XlaBuilder* builder, const xla::XlaOp& lhs, const xla::XlaOp& rhs, + const BCast& broadcast_helper) { // Manually construct the broadcasting since MapN does not do // automatic broadcasting. The bcast helper ensures that // lhs.reshape(bcast.x_reshape()).broadcast(bcast.x_bcast()) and diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h index 5bc1d5fb1f08fb..4f92dbc8740b69 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/bcast.h" @@ -30,7 +30,7 @@ namespace tensorflow { // inputs that can be broadcast to the same shape. The base class // contains pure virtual methods to override: description is a textual // description of the operation; and Computation adds the -// implementation of the operation to a xla::ComputationBuilder. For most +// implementation of the operation to a xla::XlaBuilder. For most // arithmetic Ops XLA handles the broadcasting automatically given the input // tensors. class XlaBinaryOp : public XlaOpKernel { @@ -55,10 +55,9 @@ class XlaBinaryOp : public XlaOpKernel { // higher-rank input should be matched when broadcasting the // lower-rank input. 
See comment below and the documentation on broadcasting // in the XLA documentation. - virtual xla::ComputationDataHandle Computation( - XlaOpKernelContext* ctx, const xla::ComputationDataHandle& lhs, - const gtl::ArraySlice& lhs_shape, - const xla::ComputationDataHandle& rhs, + virtual xla::XlaOp Computation( + XlaOpKernelContext* ctx, const xla::XlaOp& lhs, + const gtl::ArraySlice& lhs_shape, const xla::XlaOp& rhs, const gtl::ArraySlice& rhs_shape, const BCast& broadcast_helper, const std::vector& extend_dimensions) = 0; @@ -67,11 +66,9 @@ class XlaBinaryOp : public XlaOpKernel { // Helper function that performs the broadcasting described by // 'broadcast_helper', yielding arguments 'lhs' and 'rhs' that have the same // shape. - static std::pair - Broadcast(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& lhs, - const xla::ComputationDataHandle& rhs, - const BCast& broadcast_helper); + static std::pair Broadcast( + xla::XlaBuilder* builder, const xla::XlaOp& lhs, const xla::XlaOp& rhs, + const BCast& broadcast_helper); }; } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc index 96d7809f799563..23243f62462c63 100644 --- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc @@ -50,8 +50,8 @@ class DepthToSpaceOp : public XlaOpKernel { const gtl::InlinedVector input_shape = input_tensor_shape.dim_sizes(); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp input = ctx->Input(0); int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_); int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_); @@ -130,7 +130,7 @@ class DepthToSpaceOp : public XlaOpKernel { ") is not divisible by square of the block size (", block_size_, ")")); - xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape); + xla::XlaOp reshaped = b->Reshape(input, reshaped_shape); // 2. Permute dimensions of `reshaped` to produce // `permuted_reshaped` of shape: @@ -141,8 +141,7 @@ class DepthToSpaceOp : public XlaOpKernel { // input_shape[2], // block_size_, // depth / (block_size_ * block_size_)] - xla::ComputationDataHandle permuted_reshaped = - b->Transpose(reshaped, transpose_order); + xla::XlaOp permuted_reshaped = b->Transpose(reshaped, transpose_order); // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the // batch dimension, producing an output tensor of shape: @@ -152,8 +151,7 @@ class DepthToSpaceOp : public XlaOpKernel { // input_shape[2] * block_size_, // depth / (block_size_ * block_size_)] // - xla::ComputationDataHandle output = - b->Reshape(permuted_reshaped, output_shape); + xla::XlaOp output = b->Reshape(permuted_reshaped, output_shape); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index 765ea922a532a0..931705ba837153 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -25,10 +25,10 @@ namespace tensorflow { namespace { // Create a diagonal / batch diagonal matrix with 'input' on the diagonal. 
-xla::StatusOr CreateDiagonal( - const xla::ComputationDataHandle& input, int64 last_dim_size, +xla::StatusOr CreateDiagonal( + const xla::XlaOp& input, int64 last_dim_size, tensorflow::gtl::ArraySlice other_dims, XlaOpKernelContext* ctx, - xla::ComputationBuilder* builder) { + xla::XlaBuilder* builder) { // Create two matrices that have the following forms, and compare them: // // [[0, 0, 0, 0] [[0, 1, 2, 3] @@ -38,12 +38,11 @@ xla::StatusOr CreateDiagonal( // // This produces a predicate matrix of the right size, with "true" on the // diagonal. - xla::ComputationDataHandle iota; + xla::XlaOp iota; TF_RETURN_IF_ERROR( XlaHelpers::Iota(builder, DataType::DT_INT32, last_dim_size, &iota)); - xla::ComputationDataHandle iota_broadcast = - builder->Broadcast(iota, {last_dim_size}); - xla::ComputationDataHandle mask = builder->Eq(iota_broadcast, iota, {0}); + xla::XlaOp iota_broadcast = builder->Broadcast(iota, {last_dim_size}); + xla::XlaOp mask = builder->Eq(iota_broadcast, iota, {0}); // If this is a batched diagonal, broadcast the mask across the other // dimensions. @@ -65,8 +64,7 @@ xla::StatusOr CreateDiagonal( std::vector broadcast_dims(other_dims.begin(), other_dims.end()); broadcast_dims.push_back(1LL); broadcast_dims.push_back(last_dim_size); - xla::ComputationDataHandle input_broadcast = - builder->Reshape(input, broadcast_dims); + xla::XlaOp input_broadcast = builder->Reshape(input, broadcast_dims); broadcast_dims[broadcast_dims.size() - 2] = last_dim_size; xla::PrimitiveType element_type; @@ -74,7 +72,7 @@ xla::StatusOr CreateDiagonal( DataTypeToPrimitiveType(ctx->input_type(0), &element_type)); auto broadcast_shape = xla::ShapeUtil::MakeShape(element_type, broadcast_dims); - xla::ComputationDataHandle zeros = Zeros(builder, broadcast_shape); + xla::XlaOp zeros = Zeros(builder, broadcast_shape); input_broadcast = builder->Add(input_broadcast, zeros); return builder->Select(mask, input_broadcast, zeros); @@ -85,7 +83,7 @@ class DiagOp : public XlaOpKernel { explicit DiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); OP_REQUIRES(ctx, ctx->num_inputs() >= 1, errors::InvalidArgument("Diag op must have at an input")); @@ -96,7 +94,7 @@ class DiagOp : public XlaOpKernel { errors::InvalidArgument("Expected 1 <= dims, got shape ", input_shape.DebugString())); - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); // Picture: // tf.diag([1, 2, 3, 4]) ==> [[1, 0, 0, 0] @@ -112,7 +110,7 @@ class DiagOp : public XlaOpKernel { auto diag_or_status = CreateDiagonal(input, size, /*other_dims=*/{}, ctx, builder); OP_REQUIRES_OK(ctx, diag_or_status.status()); - xla::ComputationDataHandle diag = diag_or_status.ValueOrDie(); + xla::XlaOp diag = diag_or_status.ValueOrDie(); // Reshapes to the final shape. 
std::vector new_dims(dims.size() * 2); @@ -131,7 +129,7 @@ class DiagPartOp : public XlaOpKernel { explicit DiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); const TensorShape input_shape = ctx->InputShape(0); auto dims = input_shape.dim_sizes(); @@ -158,7 +156,7 @@ class DiagPartOp : public XlaOpKernel { new_dims.push_back(dims[i]); } - xla::ComputationDataHandle diag = ctx->Input(0); + xla::XlaOp diag = ctx->Input(0); // TODO(b/30878775): use Slice with strides when supported, in place of // the Pad -> Reshape -> Slice. @@ -199,7 +197,7 @@ class MatrixDiagOp : public XlaOpKernel { explicit MatrixDiagOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); OP_REQUIRES(ctx, ctx->num_inputs() >= 1, errors::InvalidArgument("MatrixDiag op must have at an input")); @@ -210,7 +208,7 @@ class MatrixDiagOp : public XlaOpKernel { errors::InvalidArgument("Expected 1 <= dims, got shape ", input_shape.DebugString())); - xla::ComputationDataHandle diag = ctx->Input(0); + xla::XlaOp diag = ctx->Input(0); int last_dim = dims.size() - 1; int64 last_dim_size = input_shape.dim_size(last_dim); @@ -232,7 +230,7 @@ class MatrixDiagPartOp : public XlaOpKernel { explicit MatrixDiagPartOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); const TensorShape input_shape = ctx->InputShape(0); auto dims = input_shape.dim_sizes(); @@ -241,7 +239,7 @@ class MatrixDiagPartOp : public XlaOpKernel { errors::InvalidArgument("Expected 2 <= dims, got shape ", input_shape.DebugString())); - xla::ComputationDataHandle diag = ctx->Input(0); + xla::XlaOp diag = ctx->Input(0); int last_dim = dims.size() - 1; int64 last_dim_size = dims[last_dim]; diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc index 800ef5ab98d70a..0419de78b2ee83 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_slice_ops.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -57,7 +57,7 @@ class DynamicUpdateSliceOp : public XlaOpKernel { input_shape.DebugString(), "; update shape is ", update_shape.DebugString())); - xla::ComputationDataHandle result = ctx->builder()->DynamicUpdateSlice( + xla::XlaOp result = ctx->builder()->DynamicUpdateSlice( ctx->Input(0), ctx->Input(1), ctx->Input(2)); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc index f2cd21ffb9ce88..dd4a1690877950 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc @@ -56,7 +56,7 @@ class DynamicStitchOp : public XlaOpKernel { std::vector indices_input; OP_REQUIRES_OK(ctx, ctx->ConstantInputList("indices", &indices_input)); - std::vector data; + std::vector data; std::vector data_shapes; OP_REQUIRES_OK(ctx, ctx->InputList("data", &data, &data_shapes)); @@ -136,7 +136,7 @@ class DynamicStitchOp : public XlaOpKernel { // Look up all the children expressions that represent the data // inputs. - std::vector input(indices.size()); + std::vector input(indices.size()); for (int input_num = 0; input_num < indices.size(); input_num++) { TensorShape new_shape; // first reshaped dimension is the number of indices for this input. @@ -166,7 +166,7 @@ class DynamicStitchOp : public XlaOpKernel { for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) { slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d); } - std::vector to_concat(number_of_indices); + std::vector to_concat(number_of_indices); for (int index_num = 0; index_num < number_of_indices; index_num++) { const auto& expression = input[src_input_vector[index_num]]; // Take the appropriate slice of data. diff --git a/tensorflow/compiler/tf2xla/kernels/elu_op.cc b/tensorflow/compiler/tf2xla/kernels/elu_op.cc index 2fd27c5ca7e87c..493781a1e68b89 100644 --- a/tensorflow/compiler/tf2xla/kernels/elu_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/elu_op.cc @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" @@ -32,11 +32,10 @@ class EluOp : public XlaOpKernel { explicit EluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Computes the max of the scalar input x and 0. 
void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const auto zero = XlaHelpers::Zero(b, input_type(0)); - const auto one = XlaHelpers::One(b, input_type(0)); const auto pred = b->Gt(ctx->Input(0), zero); - const auto expm1 = b->Sub(b->Exp(ctx->Input(0)), one); + const auto expm1 = b->Expm1(ctx->Input(0)); ctx->SetOutput(0, b->Select(pred, ctx->Input(0), expm1)); } }; @@ -47,7 +46,7 @@ class EluGradOp : public XlaOpKernel { // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return lhs * (1 + rhs). void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const auto zero = XlaHelpers::Zero(b, input_type(0)); const auto one = XlaHelpers::One(b, input_type(0)); const auto grad = ctx->Input(0); @@ -66,15 +65,14 @@ class SeluOp : public XlaOpKernel { explicit SeluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Computes the max of the scalar input x and 0. void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const auto zero = XlaHelpers::Zero(b, input_type(0)); - const auto one = XlaHelpers::One(b, input_type(0)); const auto scale = XlaHelpers::FloatLiteral(b, input_type(0), 1.0507009873554804934193349852946); const auto scale_alpha = XlaHelpers::FloatLiteral(b, input_type(0), 1.7580993408473768599402175208123); const auto pred = b->Gt(ctx->Input(0), zero); - const auto expm1 = b->Sub(b->Exp(ctx->Input(0)), one); + const auto expm1 = b->Expm1(ctx->Input(0)); ctx->SetOutput(0, b->Select(pred, b->Mul(scale, ctx->Input(0)), b->Mul(scale_alpha, expm1))); } @@ -86,9 +84,8 @@ class SeluGradOp : public XlaOpKernel { // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return lhs * (1 + rhs). void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const auto zero = XlaHelpers::Zero(b, input_type(0)); - const auto one = XlaHelpers::One(b, input_type(0)); const auto scale = XlaHelpers::FloatLiteral(b, input_type(0), 1.0507009873554804934193349852946); const auto scale_alpha = XlaHelpers::FloatLiteral(b, input_type(0), diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index b2970eae20a3fb..6df01cabbf1d98 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -93,7 +93,7 @@ class ExtractImagePatchesOp : public XlaOpKernel { input_shape.DebugString())); const int64 depth = input_shape.dim_size(feature_dim); - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); // The following code is equivalent to: // eye = np.eye(kH * kW * D).reshape([kH, kW, D, kH * kW * kD]) @@ -110,7 +110,7 @@ class ExtractImagePatchesOp : public XlaOpKernel { // Builds an identity matrix as a broadcast equality of iotas. 
// iota = np.arange(np.prod(ksize), depth) // filter = np.equal(np.reshape(iota, [-1, 1]), iota).astype(np.float32) - xla::ComputationDataHandle iota; + xla::XlaOp iota; TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, kernel_size * depth, &iota)); @@ -147,7 +147,7 @@ class ExtractImagePatchesOp : public XlaOpKernel { &padding[i].first, &padding[i].second)); } - xla::ComputationDataHandle conv = + xla::XlaOp conv = builder->ConvGeneralDilated(ctx->Input(0), filter, window_strides, padding, lhs_dilation, rhs_dilation, dims); ctx->SetOutput(0, conv); diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc index 99470d70e709dd..8f0de0a524c908 100644 --- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc @@ -44,23 +44,20 @@ void CpuNudge(const float min, const float max, const float quant_min, } // An XLA version of CpuNudge(). -void XlaNudge(xla::ComputationBuilder* b, const DataType data_type, - const xla::ComputationDataHandle& min, - const xla::ComputationDataHandle& max, +void XlaNudge(xla::XlaBuilder* b, const DataType data_type, + const xla::XlaOp& min, const xla::XlaOp& max, const float quant_min_value, const float quant_max_value, - xla::ComputationDataHandle* nudged_min, - xla::ComputationDataHandle* nudged_max, - xla::ComputationDataHandle* scale) { + xla::XlaOp* nudged_min, xla::XlaOp* nudged_max, + xla::XlaOp* scale) { *scale = b->Div(b->Sub(max, min), XlaHelpers::FloatLiteral(b, data_type, quant_max_value - quant_min_value)); - xla::ComputationDataHandle quant_min = + xla::XlaOp quant_min = XlaHelpers::FloatLiteral(b, data_type, quant_min_value); - xla::ComputationDataHandle zero_point_from_min = - b->Sub(quant_min, b->Div(min, *scale)); - xla::ComputationDataHandle quant_max = + xla::XlaOp zero_point_from_min = b->Sub(quant_min, b->Div(min, *scale)); + xla::XlaOp quant_max = XlaHelpers::FloatLiteral(b, data_type, quant_max_value); - xla::ComputationDataHandle nudged_zero_point = + xla::XlaOp nudged_zero_point = b->Select(b->Le(zero_point_from_min, quant_min), quant_min, b->Select(b->Ge(zero_point_from_min, quant_max), quant_max, b->Round(zero_point_from_min))); @@ -68,22 +65,18 @@ void XlaNudge(xla::ComputationBuilder* b, const DataType data_type, *nudged_max = b->Mul(b->Sub(quant_max, nudged_zero_point), *scale); } -xla::ComputationDataHandle Quantize( - xla::ComputationBuilder* b, const xla::ComputationDataHandle& input, - const DataType data_type, - const xla::ComputationDataHandle& nudged_input_min, - const xla::ComputationDataHandle& nudged_input_max, - const xla::ComputationDataHandle& input_scale) { - xla::ComputationDataHandle one = XlaHelpers::FloatLiteral(b, data_type, 1.0f); - xla::ComputationDataHandle inv_scale = b->Div(one, input_scale); - xla::ComputationDataHandle half = - XlaHelpers::FloatLiteral(b, data_type, 0.5f); - - xla::ComputationDataHandle clamped = - b->Clamp(nudged_input_min, input, nudged_input_max); - xla::ComputationDataHandle clamped_shifted = - b->Sub(clamped, nudged_input_min); - xla::ComputationDataHandle rounded = +xla::XlaOp Quantize(xla::XlaBuilder* b, const xla::XlaOp& input, + const DataType data_type, + const xla::XlaOp& nudged_input_min, + const xla::XlaOp& nudged_input_max, + const xla::XlaOp& input_scale) { + xla::XlaOp one = XlaHelpers::FloatLiteral(b, data_type, 1.0f); + xla::XlaOp inv_scale = b->Div(one, input_scale); + xla::XlaOp half = XlaHelpers::FloatLiteral(b, data_type, 0.5f); + + 
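+ // The input is clamped to the nudged range, shifted to start at zero,
+ // snapped to the nearest multiple of input_scale (floor(x * inv_scale + 0.5)
+ // rounds to the nearest step), and shifted back onto the nudged grid.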
xla::XlaOp clamped = b->Clamp(nudged_input_min, input, nudged_input_max); + xla::XlaOp clamped_shifted = b->Sub(clamped, nudged_input_min); + xla::XlaOp rounded = b->Floor(b->Add(b->Mul(clamped_shifted, inv_scale), half)); return b->Add(b->Mul(rounded, input_scale), nudged_input_min); } @@ -111,18 +104,18 @@ class FakeQuantWithMinMaxArgsOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); const DataType data_type = ctx->input_type(0); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle nudged_input_min = + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp nudged_input_min = XlaHelpers::FloatLiteral(b, data_type, nudged_input_min_); - xla::ComputationDataHandle nudged_input_max = + xla::XlaOp nudged_input_max = XlaHelpers::FloatLiteral(b, data_type, nudged_input_max_); - xla::ComputationDataHandle input_scale = + xla::XlaOp input_scale = XlaHelpers::FloatLiteral(b, data_type, input_scale_); - xla::ComputationDataHandle output = Quantize( - b, input, data_type, nudged_input_min, nudged_input_max, input_scale); + xla::XlaOp output = Quantize(b, input, data_type, nudged_input_min, + nudged_input_max, input_scale); ctx->SetOutput(0, output); } @@ -159,23 +152,22 @@ class FakeQuantWithMinMaxArgsGradOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle gradient = ctx->Input(0); + xla::XlaOp gradient = ctx->Input(0); const TensorShape gradient_shape = ctx->InputShape(0); - xla::ComputationDataHandle input = ctx->Input(1); + xla::XlaOp input = ctx->Input(1); const DataType data_type = ctx->input_type(1); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle nudged_input_min = + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp nudged_input_min = XlaHelpers::FloatLiteral(b, data_type, nudged_input_min_); - xla::ComputationDataHandle nudged_input_max = + xla::XlaOp nudged_input_max = XlaHelpers::FloatLiteral(b, data_type, nudged_input_max_); - xla::ComputationDataHandle between_nudged_min_max = + xla::XlaOp between_nudged_min_max = b->And(b->Le(nudged_input_min, input), b->Le(input, nudged_input_max)); - xla::ComputationDataHandle zeroes = b->Broadcast( - XlaHelpers::Zero(b, data_type), gradient_shape.dim_sizes()); - xla::ComputationDataHandle output = - b->Select(between_nudged_min_max, gradient, zeroes); + xla::XlaOp zeroes = b->Broadcast(XlaHelpers::Zero(b, data_type), + gradient_shape.dim_sizes()); + xla::XlaOp output = b->Select(between_nudged_min_max, gradient, zeroes); ctx->SetOutput(0, output); } @@ -204,18 +196,18 @@ class FakeQuantWithMinMaxVarsOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); const DataType data_type = ctx->input_type(0); - xla::ComputationDataHandle input_min = ctx->Input(1); - xla::ComputationDataHandle input_max = ctx->Input(2); + xla::XlaOp input_min = ctx->Input(1); + xla::XlaOp input_max = ctx->Input(2); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle nudged_input_min, nudged_input_max, input_scale; + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp nudged_input_min, nudged_input_max, input_scale; XlaNudge(b, data_type, input_min, input_max, quant_min_, quant_max_, &nudged_input_min, &nudged_input_max, &input_scale); - xla::ComputationDataHandle output = Quantize( - b, input, data_type, nudged_input_min, 
nudged_input_max, input_scale); + xla::XlaOp output = Quantize(b, input, data_type, nudged_input_min, + nudged_input_max, input_scale); ctx->SetOutput(0, output); } @@ -243,47 +235,43 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle gradient = ctx->Input(0); + xla::XlaOp gradient = ctx->Input(0); const TensorShape gradient_shape = ctx->InputShape(0); - xla::ComputationDataHandle input = ctx->Input(1); + xla::XlaOp input = ctx->Input(1); const DataType data_type = ctx->input_type(1); const DataType accumulation_type = XlaHelpers::SumAccumulationType(data_type); - xla::ComputationDataHandle input_min = ctx->Input(2); - xla::ComputationDataHandle input_max = ctx->Input(3); + xla::XlaOp input_min = ctx->Input(2); + xla::XlaOp input_max = ctx->Input(3); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle nudged_input_min, nudged_input_max, input_scale; + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp nudged_input_min, nudged_input_max, input_scale; XlaNudge(b, data_type, input_min, input_max, quant_min_, quant_max_, &nudged_input_min, &nudged_input_max, &input_scale); - xla::ComputationDataHandle between_nudged_min_max = + xla::XlaOp between_nudged_min_max = b->And(b->Le(nudged_input_min, input), b->Le(input, nudged_input_max)); - xla::ComputationDataHandle zero = XlaHelpers::Zero(b, data_type); - xla::ComputationDataHandle zeroes = - b->Broadcast(zero, gradient_shape.dim_sizes()); - xla::ComputationDataHandle output0 = - b->Select(between_nudged_min_max, gradient, zeroes); + xla::XlaOp zero = XlaHelpers::Zero(b, data_type); + xla::XlaOp zeroes = b->Broadcast(zero, gradient_shape.dim_sizes()); + xla::XlaOp output0 = b->Select(between_nudged_min_max, gradient, zeroes); ctx->SetOutput(0, output0); - xla::ComputationDataHandle below_min = b->Lt(input, nudged_input_min); - xla::ComputationDataHandle select1 = b->Select(below_min, gradient, zeroes); - xla::ComputationDataHandle reduce1 = b->ReduceAll( + xla::XlaOp below_min = b->Lt(input, nudged_input_min); + xla::XlaOp select1 = b->Select(below_min, gradient, zeroes); + xla::XlaOp reduce1 = b->ReduceAll( XlaHelpers::ConvertElementType(b, select1, accumulation_type), XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type)); - xla::ComputationDataHandle output1 = - XlaHelpers::ConvertElementType(b, reduce1, data_type); + xla::XlaOp output1 = XlaHelpers::ConvertElementType(b, reduce1, data_type); ctx->SetOutput(1, output1); - xla::ComputationDataHandle above_max = b->Gt(input, nudged_input_max); - xla::ComputationDataHandle select2 = b->Select(above_max, gradient, zeroes); - xla::ComputationDataHandle reduce2 = b->ReduceAll( + xla::XlaOp above_max = b->Gt(input, nudged_input_max); + xla::XlaOp select2 = b->Select(above_max, gradient, zeroes); + xla::XlaOp reduce2 = b->ReduceAll( XlaHelpers::ConvertElementType(b, select2, accumulation_type), XlaHelpers::Zero(b, accumulation_type), *ctx->GetOrCreateAdd(accumulation_type)); - xla::ComputationDataHandle output2 = - XlaHelpers::ConvertElementType(b, reduce2, data_type); + xla::XlaOp output2 = XlaHelpers::ConvertElementType(b, reduce2, data_type); ctx->SetOutput(2, output2); } diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc index a4f3c1c3ad9a92..933924cad1c7ca 100644 --- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc @@ -62,9 +62,8 @@ class GenericFftOp : 
public XlaOpKernel { } } - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle fft = - b->Fft(ctx->Input(0), fft_type_, fft_length); + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp fft = b->Fft(ctx->Input(0), fft_type_, fft_length); ctx->SetOutput(0, fft); } @@ -82,9 +81,11 @@ class FFTOp : public GenericFftOp { explicit FFTOp(OpKernelConstruction* ctx) : GenericFftOp(ctx, /*fft_type=*/FftType::FFT, /*fft_rank=*/FFTRank) {} }; -REGISTER_XLA_OP(Name("FFT"), FFTOp<1>); -REGISTER_XLA_OP(Name("FFT2D"), FFTOp<2>); -REGISTER_XLA_OP(Name("FFT3D"), FFTOp<3>); +REGISTER_XLA_OP(Name("FFT").TypeConstraint("Tcomplex", DT_COMPLEX64), FFTOp<1>); +REGISTER_XLA_OP(Name("FFT2D").TypeConstraint("Tcomplex", DT_COMPLEX64), + FFTOp<2>); +REGISTER_XLA_OP(Name("FFT3D").TypeConstraint("Tcomplex", DT_COMPLEX64), + FFTOp<3>); template class IFFTOp : public GenericFftOp { @@ -92,9 +93,12 @@ class IFFTOp : public GenericFftOp { explicit IFFTOp(OpKernelConstruction* ctx) : GenericFftOp(ctx, /*fft_type=*/FftType::IFFT, /*fft_rank=*/FFTRank) {} }; -REGISTER_XLA_OP(Name("IFFT"), IFFTOp<1>); -REGISTER_XLA_OP(Name("IFFT2D"), IFFTOp<2>); -REGISTER_XLA_OP(Name("IFFT3D"), IFFTOp<3>); +REGISTER_XLA_OP(Name("IFFT").TypeConstraint("Tcomplex", DT_COMPLEX64), + IFFTOp<1>); +REGISTER_XLA_OP(Name("IFFT2D").TypeConstraint("Tcomplex", DT_COMPLEX64), + IFFTOp<2>); +REGISTER_XLA_OP(Name("IFFT3D").TypeConstraint("Tcomplex", DT_COMPLEX64), + IFFTOp<3>); template class RFFTOp : public GenericFftOp { diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc index eaa13b8dfacce9..e4467a0fb138ed 100644 --- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc @@ -48,7 +48,7 @@ class FillOp : public XlaOpKernel { 0, {dims_shape.num_elements()}, &dims_literal)); // Convert the dims literal into a vector that we can pass to - // ComputationBuilder. + // XlaBuilder. std::vector broadcast; broadcast.reserve(dims_literal.shape().dimensions(0)); for (int i = 0; i < dims_literal.shape().dimensions(0); ++i) { @@ -56,7 +56,7 @@ class FillOp : public XlaOpKernel { } // Look up the value input, reshaping to a scalar if it was a // 'legacy' scalar (secretly a vector). - xla::ComputationDataHandle data = ctx->Input(1); + xla::XlaOp data = ctx->Input(1); if (value_shape.dims() > 0) { CHECK_EQ(value_shape.dims(), 1); data = ctx->builder()->Reshape(data, {}); diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index 0b79cb0916ee8a..d13e25bcddae16 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -26,13 +26,11 @@ limitations under the License. namespace tensorflow { -Status XlaGather(const xla::ComputationDataHandle& input, - const TensorShape& input_shape, - const xla::ComputationDataHandle& indices, - const TensorShape& indices_shape, int64 axis, - bool indices_are_nd, DataType dtype, DataType index_type, - xla::ComputationBuilder* builder, - xla::ComputationDataHandle* gather_output) { +Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, + const xla::XlaOp& indices, const TensorShape& indices_shape, + int64 axis, bool indices_are_nd, DataType dtype, + DataType index_type, xla::XlaBuilder* builder, + xla::XlaOp* gather_output) { // There is no deep reason why we need this precondition, but this is the only // combination that is used and tested today. 
CHECK(!indices_are_nd || axis == 0); @@ -153,7 +151,7 @@ class GatherOp : public XlaOpKernel { explicit GatherOp(OpKernelConstruction* context) : XlaOpKernel(context) {} void Compile(XlaOpKernelContext* context) override { - xla::ComputationBuilder* builder = context->builder(); + xla::XlaBuilder* builder = context->builder(); auto input = context->Input(0); auto input_shape = context->InputShape(0); auto indices = context->Input(1); @@ -182,7 +180,7 @@ class GatherOp : public XlaOpKernel { OP_REQUIRES(context, index_type == DT_INT32 || index_type == DT_INT64, errors::InvalidArgument("indices must be int32 or int64")); - xla::ComputationDataHandle gather; + xla::XlaOp gather; OP_REQUIRES_OK( context, XlaGather(input, input_shape, indices, indices_shape, axis, /*indices_are_nd=*/false, input_type(0), index_type, @@ -220,10 +218,10 @@ class GatherNdOp : public XlaOpKernel { indices_shape.dim_size(indices_shape.dims() - 1), " vs. ", params_shape.dims())); - xla::ComputationBuilder* builder = context->builder(); + xla::XlaBuilder* builder = context->builder(); auto params = context->Input(0); auto indices = context->Input(1); - xla::ComputationDataHandle gather; + xla::XlaOp gather; OP_REQUIRES_OK(context, XlaGather(params, params_shape, indices, indices_shape, /*axis=*/0, /*indices_are_nd=*/true, params_type, diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h index f9376f0eabdc0f..d898e43b858bac 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h +++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h @@ -20,7 +20,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/util/bcast.h" @@ -33,13 +33,11 @@ namespace tensorflow { // If `indices_are_nd` is true, the last dimension of `indices` are treated as // a multidimensional index values. Otherwise, `indices` is treated as a tensor // of scalar indices. -Status XlaGather(const xla::ComputationDataHandle& input, - const TensorShape& input_shape, - const xla::ComputationDataHandle& indices, - const TensorShape& indices_shape, int64 axis, - bool indices_are_nd, DataType dtype, DataType index_type, - xla::ComputationBuilder* builder, - xla::ComputationDataHandle* gather_output); +Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, + const xla::XlaOp& indices, const TensorShape& indices_shape, + int64 axis, bool indices_are_nd, DataType dtype, + DataType index_type, xla::XlaBuilder* builder, + xla::XlaOp* gather_output); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index eefbe55c815d80..8b9b026643cf35 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -37,7 +37,7 @@ XlaIfOp::XlaIfOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) { // TODO(b/35949885): There is duplication here with the handling of the // while_op. Refactor the common code out/rework. 
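As a standalone illustration of the two indexing modes that the `XlaGather` helper above distinguishes with `indices_are_nd` (illustrative semantics only, not the helper's implementation): plain gather picks whole slices along `axis` from scalar indices, while the N-d form treats the last dimension of `indices` as a full coordinate into the leading dimensions of the input.

```c++
#include <iostream>
#include <utility>
#include <vector>

using Matrix = std::vector<std::vector<int>>;

// indices_are_nd == false, axis == 0: each scalar index selects a whole row.
Matrix GatherRows(const Matrix& params, const std::vector<int>& indices) {
  Matrix out;
  for (int i : indices) out.push_back(params[i]);
  return out;
}

// indices_are_nd == true: each entry of `indices` is a (row, col) coordinate.
std::vector<int> GatherNd(const Matrix& params,
                          const std::vector<std::pair<int, int>>& indices) {
  std::vector<int> out;
  for (const auto& ij : indices) out.push_back(params[ij.first][ij.second]);
  return out;
}

int main() {
  Matrix params = {{1, 2}, {3, 4}, {5, 6}};

  // Plain gather along axis 0 with indices {2, 0} selects rows 2 and 0.
  Matrix rows = GatherRows(params, {2, 0});                    // {{5, 6}, {1, 2}}

  // gather_nd with coordinates (2, 1) and (0, 0) selects single elements.
  std::vector<int> vals = GatherNd(params, {{2, 1}, {0, 0}});  // {6, 1}

  std::cout << rows[0][0] << " " << rows[1][1] << " " << vals[0] << " "
            << vals[1] << "\n";  // prints: 5 2 6 1
  return 0;
}
```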
void XlaIfOp::Compile(XlaOpKernelContext* ctx) { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); OP_REQUIRES(ctx, cond_type_ == DT_BOOL, errors::InvalidArgument( @@ -48,7 +48,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { VLOG(1) << "Building If: " << input_types_.size() << " inputs"; - std::vector inputs(input_types_.size()); + std::vector inputs(input_types_.size()); std::vector arguments(input_types_.size()); for (int i = 0; i < input_types_.size(); ++i) { XlaCompiler::Argument& arg = arguments[i]; @@ -175,19 +175,19 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) { "Mismatch in resource of then and else branch for resource ", i)); } - xla::ComputationDataHandle outputs = + xla::XlaOp outputs = b->Conditional(ctx->Input(0), b->Tuple(inputs), *then_result.computation, b->Tuple(inputs), *else_result.computation); // Sets non-variable outputs. for (int i = 0; i < output_types_.size(); ++i) { if (ctx->input_type(i) != DT_RESOURCE) { - xla::ComputationDataHandle output_handle = b->GetTupleElement(outputs, i); + xla::XlaOp output_handle = b->GetTupleElement(outputs, i); if (VLOG_IS_ON(2)) { LOG(INFO) << "Setting output " << i; auto shape_or = b->GetShape(output_handle); if (shape_or.ok()) { LOG(INFO) << "Shape for output " << i << ": " - << xla::ShapeUtil::HumanString(*shape_or.ValueOrDie()); + << xla::ShapeUtil::HumanString(shape_or.ValueOrDie()); } else { LOG(INFO) << "Shape unknown for output " << i; } diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc index 5eeda79a935e81..1568b33679963c 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc @@ -23,10 +23,9 @@ namespace { // Converts 'input' from RGB format to HSV format. // 'shape' is the shape of the red/green/blue tensors. -std::array RGBToHSV( - XlaOpKernelContext* ctx, xla::ComputationBuilder* b, - const std::array& rgb, DataType dtype, - const TensorShape& shape) { +std::array RGBToHSV(XlaOpKernelContext* ctx, xla::XlaBuilder* b, + const std::array& rgb, + DataType dtype, const TensorShape& shape) { auto zero = XlaHelpers::Zero(b, dtype); auto one = XlaHelpers::One(b, dtype); @@ -54,12 +53,12 @@ std::array RGBToHSV( } // Converts 'input' from HSV format to RGB format. 
-std::array HSVToRGB( - xla::ComputationBuilder* b, - const std::array& hsv, DataType dtype) { - xla::ComputationDataHandle hue = hsv[0]; - xla::ComputationDataHandle saturation = hsv[1]; - xla::ComputationDataHandle value = hsv[2]; +std::array HSVToRGB(xla::XlaBuilder* b, + const std::array& hsv, + DataType dtype) { + xla::XlaOp hue = hsv[0]; + xla::XlaOp saturation = hsv[1]; + xla::XlaOp value = hsv[2]; auto zero = XlaHelpers::Zero(b, dtype); auto one = XlaHelpers::FloatLiteral(b, dtype, 1.0); auto two = XlaHelpers::FloatLiteral(b, dtype, 2.0); @@ -95,16 +94,16 @@ class RGBToHSVOp : public XlaOpKernel { errors::FailedPrecondition("input must have 3 channels but input has ", channels, " channels.")); - xla::ComputationBuilder* b = context->builder(); - xla::ComputationDataHandle input = context->Input(0); + xla::XlaBuilder* b = context->builder(); + xla::XlaOp input = context->Input(0); - xla::ComputationDataHandle red = + xla::XlaOp red = b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle green = + xla::XlaOp green = b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle blue = + xla::XlaOp blue = b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, /*dimno=*/channel_dim); TensorShape channel_shape = input_shape; @@ -133,15 +132,15 @@ class HSVToRGBOp : public XlaOpKernel { errors::FailedPrecondition("input must have 3 channels but input has ", channels, " channels.")); - xla::ComputationBuilder* b = context->builder(); - xla::ComputationDataHandle input = context->Input(0); - xla::ComputationDataHandle hue = + xla::XlaBuilder* b = context->builder(); + xla::XlaOp input = context->Input(0); + xla::XlaOp hue = b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle saturation = + xla::XlaOp saturation = b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle value = + xla::XlaOp value = b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, /*dimno=*/channel_dim); @@ -174,9 +173,9 @@ class AdjustContrastOpV2 : public XlaOpKernel { errors::InvalidArgument("contrast_factor must be scalar: ", factor_shape.DebugString())); - xla::ComputationBuilder* b = context->builder(); - xla::ComputationDataHandle input = context->Input(0); - xla::ComputationDataHandle factor = context->Input(1); + xla::XlaBuilder* b = context->builder(); + xla::XlaOp input = context->Input(0); + xla::XlaOp factor = context->Input(1); DataType type = context->input_type(0); @@ -221,19 +220,19 @@ class AdjustSaturationOp : public XlaOpKernel { errors::InvalidArgument("input must have 3 channels but instead has ", channels, " channels.")); - xla::ComputationBuilder* b = context->builder(); - xla::ComputationDataHandle input = context->Input(0); - xla::ComputationDataHandle scale = context->Input(1); + xla::XlaBuilder* b = context->builder(); + xla::XlaOp input = context->Input(0); + xla::XlaOp scale = context->Input(1); DataType type = context->input_type(0); - xla::ComputationDataHandle red = + xla::XlaOp red = b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle green = + xla::XlaOp green = b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle blue = + 
xla::XlaOp blue = b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, /*dimno=*/channel_dim); TensorShape channel_shape = input_shape; @@ -271,19 +270,19 @@ class AdjustHueOp : public XlaOpKernel { errors::InvalidArgument("input must have 3 channels but instead has ", channels, " channels.")); - xla::ComputationBuilder* b = context->builder(); - xla::ComputationDataHandle input = context->Input(0); - xla::ComputationDataHandle delta = context->Input(1); + xla::XlaBuilder* b = context->builder(); + xla::XlaOp input = context->Input(0); + xla::XlaOp delta = context->Input(1); DataType type = context->input_type(0); - xla::ComputationDataHandle red = + xla::XlaOp red = b->SliceInDim(input, /*start_index=*/0, /*limit_index=*/1, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle green = + xla::XlaOp green = b->SliceInDim(input, /*start_index=*/1, /*limit_index=*/2, /*stride=*/1, /*dimno=*/channel_dim); - xla::ComputationDataHandle blue = + xla::XlaOp blue = b->SliceInDim(input, /*start_index=*/2, /*limit_index=*/3, /*stride=*/1, /*dimno=*/channel_dim); TensorShape channel_shape = input_shape; diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index f36b3f594826c2..79d3a6979cec4c 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -99,28 +99,35 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters( return dims; } -xla::ComputationDataHandle MakeBilinearResizeKernel( - xla::ComputationBuilder* builder, gtl::ArraySlice kernel_size, - int64 channels) { - // Form a 2D convolution kernel like: - // 1 2 3 2 1 - // 2 4 6 4 2 - // 1/9 * 3 6 9 6 3 - // 2 4 6 4 2 - // 1 2 3 2 1 - // by multiplying two 1D kernels of the form: - // 1/3 * [1 2 3 2 1] - auto make_1d_kernel = [](int64 n) { - std::vector kernel(n * 2 - 1); - for (int64 i = 0; i < n; ++i) { - float v = (i + 1.0f) / n; - kernel[i] = v; - kernel[n * 2 - 2 - i] = v; - } - return kernel; - }; +// Form a 2D convolution kernel like: +// 1 2 3 2 1 +// 2 4 6 4 2 +// 1/9 * 3 6 9 6 3 +// 2 4 6 4 2 +// 1 2 3 2 1 +// by multiplying two 1D kernels of the form: +// 1/3 * [1 2 3 2 1] +// If the 2D kernel would be very large, the 1D kernel can be applied once in +// each dimension due to the symmetry of the kernel along all axes to reduce the +// computational intensity. +std::vector Make1DKernel(int64 n) { + std::vector kernel(n * 2 - 1); + for (int64 i = 0; i < n; ++i) { + float v = (i + 1.0f) / n; + kernel[i] = v; + kernel[n * 2 - 2 - i] = v; + } + return kernel; +} + +// Kernels with more than 16 spatial elements are considered intense and the +// kernel should be applied to each dimension independently. +const int64 kMax2DKernelSize = 16; - xla::ComputationDataHandle channels_iota; +xla::XlaOp MakeBilinearResizeKernel(xla::XlaBuilder* builder, + gtl::ArraySlice kernel_size, + int64 channels) { + xla::XlaOp channels_iota; // DT_INT32 Iota will always return status::OK().
TF_CHECK_OK( XlaHelpers::Iota(builder, DataType::DT_INT32, channels, &channels_iota)); @@ -133,16 +140,43 @@ xla::ComputationDataHandle MakeBilinearResizeKernel( xla::PrimitiveType::F32); return builder->Mul( builder->Mul(diag, - builder->ConstantR1(make_1d_kernel(kernel_size[1])), + builder->ConstantR1(Make1DKernel(kernel_size[1])), /*broadcast_dimensions=*/{1}), - builder->ConstantR1(make_1d_kernel(kernel_size[0])), + builder->ConstantR1(Make1DKernel(kernel_size[0])), /*broadcast_dimensions=*/{0}); } -xla::ComputationDataHandle ResizeUsingDilationAndConvolution( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& input, - const int num_spatial_dims, std::vector in_size, - std::vector out_size, const int64 channels) { +xla::XlaOp MakeBilinearResizeKernelInDim(xla::XlaBuilder* builder, + gtl::ArraySlice kernel_size, + int64 channels, int64 dim) { + xla::XlaOp channels_iota; + // DT_INT32 Iota will always return status::OK(). + TF_CHECK_OK( + XlaHelpers::Iota(builder, DataType::DT_INT32, channels, &channels_iota)); + + auto diag = builder->ConvertElementType( + builder->Eq(builder->Broadcast( + channels_iota, + {dim == 0 ? (2 * kernel_size[0] - 1) : 1, + dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels}), + channels_iota, /*broadcast_dimensions=*/{2}), + xla::PrimitiveType::F32); + if (dim == 1) { + return builder->Mul( + diag, builder->ConstantR1(Make1DKernel(kernel_size[1])), + /*broadcast_dimensions=*/{1}); + } + return builder->Mul(diag, + builder->ConstantR1(Make1DKernel(kernel_size[0])), + /*broadcast_dimensions=*/{0}); +} + +xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder, + const xla::XlaOp& input, + const int num_spatial_dims, + std::vector in_size, + std::vector out_size, + const int64 channels) { // Picture for a 1x3 to 1x4 resize: // stride = 2, kernel size = 3 // Input: @@ -163,20 +197,42 @@ xla::ComputationDataHandle ResizeUsingDilationAndConvolution( dimension_numbers.add_output_spatial_dimensions(1 + i); dimension_numbers.add_kernel_spatial_dimensions(i); } - dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims); - dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1); + dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims + 1); + dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims); ResizeConvolutionDims dims = ComputeResizeConvolutionParameters(in_size, out_size); - xla::ComputationDataHandle kernel = - MakeBilinearResizeKernel(builder, dims.kernel_size, channels); - xla::ComputationDataHandle output = builder->ConvGeneralDilated( - input, kernel, dims.stride, - /*padding=*/ - {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, - {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, - /*lhs_dilation=*/dims.kernel_size, - /*rhs_dilation=*/{1, 1}, dimension_numbers); + xla::XlaOp output; + // Split convolutions into independent dimensions if they would be a very + // large kernel.
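A standalone sketch of why the per-dimension split above is valid (plain `float` arrays, not XLA ops): the 2D bilinear kernel is the outer product of the two 1D kernels produced by `Make1DKernel`, so convolving with the 2D kernel is equivalent to convolving with the row kernel and then the column kernel, which is much cheaper when `2 * kernel_size - 1` is large.

```c++
#include <cstdio>
#include <vector>

// Mirrors Make1DKernel in the patch: a triangular kernel of length 2*n - 1,
// e.g. n = 3 -> {1/3, 2/3, 1, 2/3, 1/3}.
std::vector<float> Make1DKernel(int n) {
  std::vector<float> kernel(n * 2 - 1);
  for (int i = 0; i < n; ++i) {
    float v = (i + 1.0f) / n;
    kernel[i] = v;
    kernel[n * 2 - 2 - i] = v;
  }
  return kernel;
}

int main() {
  const int n = 3;
  std::vector<float> k = Make1DKernel(n);
  // The fused 2D kernel is the outer product k[i] * k[j]; printing it times
  // n*n reproduces the integer matrix from the comment in
  // image_resize_ops.cc (1 2 3 2 1 / 2 4 6 4 2 / 3 6 9 6 3 / ... , all * 1/9).
  for (size_t i = 0; i < k.size(); ++i) {
    for (size_t j = 0; j < k.size(); ++j) {
      std::printf("%2.0f ", k[i] * k[j] * n * n);
    }
    std::printf("\n");
  }
  return 0;
}
```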
+ if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) { + xla::XlaOp kernel = + MakeBilinearResizeKernel(builder, dims.kernel_size, channels); + output = builder->ConvGeneralDilated( + input, kernel, dims.stride, + /*padding=*/ + {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, + {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, + /*lhs_dilation=*/dims.kernel_size, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } else { + xla::XlaOp kernel0 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0); + output = builder->ConvGeneralDilated( + input, kernel0, {dims.stride[0], 1}, + /*padding=*/ + {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, {0, 0}}, + /*lhs_dilation=*/{dims.kernel_size[0], 1}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + xla::XlaOp kernel1 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1); + output = builder->ConvGeneralDilated( + output, kernel1, {1, dims.stride[1]}, + /*padding=*/ + {{0, 0}, {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, + /*lhs_dilation=*/{1, dims.kernel_size[1]}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } // Add broadcasts to handle expanding from a size == 1 dimension to a // size > 1 dimension. @@ -189,10 +245,12 @@ xla::ComputationDataHandle ResizeUsingDilationAndConvolution( return output; } -xla::ComputationDataHandle ResizeUsingDilationAndConvolutionGradOp( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& grad, - const int num_spatial_dims, std::vector in_size, - std::vector grad_size, const int64 channels) { +xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder, + const xla::XlaOp& grad, + const int num_spatial_dims, + std::vector in_size, + std::vector grad_size, + const int64 channels) { ResizeConvolutionDims dims = ComputeResizeConvolutionParameters(in_size, grad_size); @@ -210,26 +268,63 @@ xla::ComputationDataHandle ResizeUsingDilationAndConvolutionGradOp( } dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims); dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1); - xla::ComputationDataHandle kernel = - MakeBilinearResizeKernel(builder, dims.kernel_size, channels); + xla::XlaOp output; + if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) { + xla::XlaOp kernel = + MakeBilinearResizeKernel(builder, dims.kernel_size, channels); + + // Broadcast the input kernel where the forward op expanded from a size == 1 + // dimension to a size > 1 dimension. This has the effect of summing the + // gradient contributions in that dimension. + for (int i = 0; i < num_spatial_dims; ++i) { + if (in_size[i] == 1 && grad_size[i] > 1) { + kernel = + builder->Add(kernel, builder->ConstantR1(grad_size[i], 0), + /*broadcast_dimensions=*/{i}); + } + } - // Broadcast the input kernel where the forward op expanded from a size == 1 - // dimension to a size > 1 dimension. This has the effect of summing the - // gradient contributions in that dimension. 
- for (int i = 0; i < num_spatial_dims; ++i) { - if (in_size[i] == 1 && grad_size[i] > 1) { - kernel = builder->Add(kernel, builder->ConstantR1(grad_size[i], 0), - /*broadcast_dimensions=*/{i}); + output = builder->ConvGeneralDilated( + grad, kernel, /*window_strides=*/dims.kernel_size, + /*padding=*/ + {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, + {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, + /*lhs_dilation=*/dims.stride, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } else { + xla::XlaOp kernel0 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0); + xla::XlaOp kernel1 = + MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1); + + // Broadcast the input kernel where the forward op expanded from a size == 1 + // dimension to a size > 1 dimension. This has the effect of summing the + // gradient contributions in that dimension. + if (in_size[0] == 1 && grad_size[0] > 1) { + kernel0 = + builder->Add(kernel0, builder->ConstantR1(grad_size[0], 0), + /*broadcast_dimensions=*/{0}); + } + if (in_size[1] == 1 && grad_size[1] > 1) { + kernel1 = + builder->Add(kernel1, builder->ConstantR1(grad_size[1], 0), + /*broadcast_dimensions=*/{1}); } - } - xla::ComputationDataHandle output = builder->ConvGeneralDilated( - grad, kernel, /*window_strides=*/dims.kernel_size, - /*padding=*/ - {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, - {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, - /*lhs_dilation=*/dims.stride, - /*rhs_dilation=*/{1, 1}, dimension_numbers); + output = builder->ConvGeneralDilated( + grad, kernel0, /*window_strides=*/{dims.kernel_size[0], 1}, + /*padding=*/ + {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, {0, 0}}, + /*lhs_dilation=*/{dims.stride[0], 1}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + + output = builder->ConvGeneralDilated( + output, kernel1, /*window_strides=*/{1, dims.kernel_size[1]}, + /*padding=*/ + {{0, 0}, {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}}, + /*lhs_dilation=*/{1, dims.stride[1]}, + /*rhs_dilation=*/{1, 1}, dimension_numbers); + } // If in_size[i] > 1 and grad_size[i] == 1, pad the output in dimension i. // Opposite of the slice performed by the forward op. @@ -258,7 +353,7 @@ class ResizeBilinearOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); TensorShape input_shape = ctx->InputShape(0); OP_REQUIRES(ctx, input_shape.dims() == 4, @@ -283,7 +378,7 @@ class ResizeBilinearOp : public XlaOpKernel { const int num_spatial_dims = 2; - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); // If in_size[i] > 1 and out_size[i] == 1, slice out the first input in // dimension i. @@ -318,7 +413,7 @@ class ResizeBilinearOp : public XlaOpKernel { // from image of size axb -> cxd is same as resizing axb -> exf -> cxd. // // This makes the convolutions kernels smaller and the operation faster.
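To make the "resize in stages" remark above concrete, here is a standalone sketch. It assumes the kernel-size rule implied by the "1x3 to 1x4 resize: stride = 2, kernel size = 3" picture earlier in this file, namely kernel_size = (out - 1) / gcd(in - 1, out - 1); the `KernelSize` helper is hypothetical and only illustrates why staged resizes keep the convolution kernels small.

```c++
#include <cstdio>
#include <numeric>  // std::gcd (C++17)

// Hypothetical helper mirroring the kernel-size derivation implied by the
// 1x3 -> 1x4 example: kernel_size = (out - 1) / gcd(in - 1, out - 1).
long KernelSize(long in, long out) {
  long g = std::gcd(in - 1, out - 1);
  return (out - 1) / g;
}

int main() {
  // A single direct resize from 3 to 513 pixels needs kernel size 256...
  std::printf("3 -> 513 : kernel size %ld\n", KernelSize(3, 513));
  // ...while steps that roughly double the size (3 -> 5 -> 9 -> ... -> 513)
  // only ever need kernel size 2, which is the kind of factorization the
  // comment above describes (a x b -> e x f -> c x d).
  for (long in = 3; in < 513; in = 2 * in - 1) {
    std::printf("%ld -> %ld : kernel size %ld\n", in, 2 * in - 1,
                KernelSize(in, 2 * in - 1));
  }
  return 0;
}
```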
- xla::ComputationDataHandle output = input; + xla::XlaOp output = input; while (in_size != out_size) { if (in_size[0] != 1 && in_size[1] != 1) { std::vector k = { @@ -369,7 +464,7 @@ class ResizeBilinearGradOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); TensorShape input_shape = ctx->InputShape(1); OP_REQUIRES(ctx, input_shape.dims() == 4, @@ -406,9 +501,9 @@ class ResizeBilinearGradOp : public XlaOpKernel { const int num_spatial_dims = 2; - xla::ComputationDataHandle grad = ctx->Input(0); + xla::XlaOp grad = ctx->Input(0); - xla::ComputationDataHandle output = grad; + xla::XlaOp output = grad; while (in_size != grad_size) { if (in_size[0] != 1 && in_size[1] != 1) { std::vector k = { diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops.cc b/tensorflow/compiler/tf2xla/kernels/index_ops.cc index 7bf4b435f526af..36eb4c75454ed8 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops.cc @@ -61,10 +61,10 @@ void XlaArgMinMaxOp::Compile(XlaOpKernelContext* ctx) { DataType index_type = output_type(0); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp input = ctx->Input(0); - xla::ComputationDataHandle output; + xla::XlaOp output; if (is_min_) { OP_REQUIRES_OK(ctx, XlaHelpers::ArgMin(b, ctx, input, input_shape, input_type(0), diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc index b1f3c3c298ce0c..2c2d88486fda99 100644 --- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc +++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc @@ -71,10 +71,10 @@ class ArgMaxCustomCallOp : public XlaOpKernel { OP_REQUIRES(ctx, XlaContext::Get(ctx).allow_cpu_custom_calls(), errors::InvalidArgument( "ArgMax implementation requires a CustomCall on CPU")); - xla::ComputationBuilder& b = *ctx->builder(); + xla::XlaBuilder& b = *ctx->builder(); // XLA passes to the function, so it is not included here. - std::vector args; + std::vector args; args.push_back(ctx->Input(0)); args.push_back(b.ConstantLiteral( *xla::Literal::CreateR1(input_shape.dim_sizes()))); @@ -91,7 +91,7 @@ class ArgMaxCustomCallOp : public XlaOpKernel { // Tell XLA to call the custom code, defined in // index_ops_kernel_argmax_float_1d.cc. - xla::ComputationDataHandle output; + xla::XlaOp output; switch (input_shape.dims()) { case 1: output = b.CustomCall("argmax_float_1d_xla_impl", args, xla_shape); diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc index c177f08d9c4687..1decf7d72d72bb 100644 --- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.h" #include "tensorflow/core/kernels/no_op.h" @@ -33,7 +33,7 @@ class L2LossOp : public XlaOpKernel { std::iota(dims.begin(), dims.end(), 0); DataType dtype = ctx->input_type(0); - xla::ComputationBuilder* const b = ctx->builder(); + xla::XlaBuilder* const b = ctx->builder(); // output = sum(t ** 2) / 2 const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype); diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc new file mode 100644 index 00000000000000..0388b4c830702e --- /dev/null +++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc @@ -0,0 +1,120 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// XLA-specific ListDiff Op. This only supports constant DT_INT32 and DT_INT64 +// input. + +#include + +#include "tensorflow/compiler/tf2xla/type_util.h" +#include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/core/framework/kernel_def_builder.h" +#include "tensorflow/core/framework/register_types.h" +#include "tensorflow/core/lib/core/errors.h" + +namespace tensorflow { +namespace { + +constexpr std::array kListDiffTypes = {DT_INT32, DT_INT64}; + +// ListDiffOp is an XLA kernel that supports constant-only x and y input. +class ListDiffOp : public XlaOpKernel { + public: + explicit ListDiffOp(OpKernelConstruction* context) : XlaOpKernel(context) {} + + void Compile(XlaOpKernelContext* context) override { + OP_REQUIRES(context, TensorShapeUtils::IsVector(context->InputShape(0)), + errors::InvalidArgument("ListDiff expects x as a vector, not ", + context->InputShape(0).DebugString())); + + OP_REQUIRES(context, TensorShapeUtils::IsVector(context->InputShape(1)), + errors::InvalidArgument("ListDiff expects y as a vector, not ", + context->InputShape(1).DebugString())); + + DataType val_type = context->expected_output_dtype(0); + DataType idx_type = context->expected_output_dtype(1); + + Status status; + switch (val_type) { + case DT_INT32: + status = ListDiffWithIndexType(context, idx_type); + break; + case DT_INT64: + status = ListDiffWithIndexType(context, idx_type); + break; + default: + // This should never happen since we restrict this kernel to only match + // inputs with supported Tensor datatype. 
+ status = errors::InvalidArgument("ListDiff expects x and y as either ", + "int32 or int64, not ", + DataTypeString(val_type)); + } + OP_REQUIRES_OK(context, status); + } + + private: + template + Status ListDiff(XlaOpKernelContext* context) { + std::vector x_input, y_input; + TF_RETURN_IF_ERROR(context->ConstantInputAsIntVector(0, &x_input)); + TF_RETURN_IF_ERROR(context->ConstantInputAsIntVector(1, &y_input)); + + std::unordered_set y_input_set; + y_input_set.reserve(y_input.size()); + for (auto y : y_input) { + y_input_set.insert(y); + } + + std::vector val_output; + std::vector idx_output; + auto x_size = x_input.size(); + for (Tidx i = 0; i < x_size; ++i) { + if (y_input_set.count(x_input[i]) > 0) { + continue; + } + val_output.push_back(x_input[i]); + idx_output.push_back(i); + } + + context->SetOutput(0, context->builder()->ConstantR1(val_output)); + context->SetOutput(1, context->builder()->ConstantR1(idx_output)); + return Status::OK(); + } + + template + Status ListDiffWithIndexType(XlaOpKernelContext* context, DataType idx_type) { + switch (idx_type) { + case DT_INT32: + return ListDiff(context); + case DT_INT64: + return ListDiff(context); + default: + return errors::InvalidArgument( + "ListDiff expects idx_out as either int32 or int64, not ", + DataTypeString(idx_type)); + } + } +}; + +REGISTER_XLA_OP(Name("ListDiff") + .TypeConstraint("T", kListDiffTypes) + .CompileTimeConstInput("x") + .CompileTimeConstInput("y"), + ListDiffOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc index 1cfee3070f384a..39fbf98a627491 100644 --- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc @@ -38,8 +38,8 @@ class LRNOp : public XlaOpKernel { OP_REQUIRES(ctx, in_shape.dims() == 4, errors::InvalidArgument("in must be 4-dimensional")); - xla::ComputationBuilder* builder = ctx->builder(); - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaBuilder* builder = ctx->builder(); + xla::XlaOp input = ctx->Input(0); // sqr_sum[a, b, c, d] = // sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2) @@ -111,10 +111,10 @@ class LRNGradOp : public XlaOpKernel { "input_grads, input_image, and out_image should have the same " "shape")); - xla::ComputationBuilder* builder = ctx->builder(); - xla::ComputationDataHandle in_grads = ctx->Input(0); - xla::ComputationDataHandle in_image = ctx->Input(1); - xla::ComputationDataHandle out_image = ctx->Input(2); + xla::XlaBuilder* builder = ctx->builder(); + xla::XlaOp in_grads = ctx->Input(0); + xla::XlaOp in_image = ctx->Input(1); + xla::XlaOp out_image = ctx->Input(2); // This code is ported from tensorflow/core/kernels/lrn_op.cc. 
In Python // pseudo-code, the Eigen code does this for each spatial position: @@ -166,7 +166,7 @@ class LRNGradOp : public XlaOpKernel { auto dy_reduced = XlaHelpers::ConvertElementType(builder, dy_reduce, input_type(0)); - xla::ComputationDataHandle gradients = builder->Add( + xla::XlaOp gradients = builder->Add( builder->Mul(in_image, dy_reduced), builder->Mul(in_grads, builder->Pow(norm, builder->ConstantR0(-beta_)))); diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index 886baf8115243a..6949b296f4b9af 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -66,8 +66,8 @@ class MatMulOp : public XlaOpKernel { a_shape.DebugString(), ", In[1]: ", b_shape.DebugString())); - xla::ComputationDataHandle a = ctx->Input(0); - xla::ComputationDataHandle b = ctx->Input(1); + xla::XlaOp a = ctx->Input(0); + xla::XlaOp b = ctx->Input(1); if (is_sparse_) { if (a_type_ == DT_BFLOAT16) { a = ctx->builder()->ConvertElementType(a, xla::F32); diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc index faa415a97b053b..fbd5dc0fdad448 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc @@ -44,10 +44,10 @@ class MatrixBandPartOp : public XlaOpKernel { errors::InvalidArgument("num_upper must be scalar, got shape ", num_upper_in_shape.DebugString())); - xla::ComputationBuilder* builder = context->builder(); - xla::ComputationDataHandle input = context->Input(0); - xla::ComputationDataHandle num_lower = context->Input(1); - xla::ComputationDataHandle num_upper = context->Input(2); + xla::XlaBuilder* builder = context->builder(); + xla::XlaOp input = context->Input(0); + xla::XlaOp num_lower = context->Input(1); + xla::XlaOp num_upper = context->Input(2); DataType input_type = context->input_type(0); DataType index_type = context->input_type(1); @@ -58,10 +58,10 @@ class MatrixBandPartOp : public XlaOpKernel { // Compute 'offset', which is how many diagonals we are above/below the // diagonal. - xla::ComputationDataHandle iota_m; + xla::XlaOp iota_m; OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, m, &iota_m)); - xla::ComputationDataHandle iota_n; + xla::XlaOp iota_n; OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, index_type, n, &iota_n)); auto offset = builder->Sub(builder->Broadcast(iota_n, {m}), iota_m, diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc index b2940bdcff75a0..db53f6fef8d6bf 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc @@ -54,16 +54,16 @@ class MatrixSetDiagOp : public XlaOpKernel { input_shape.DebugString(), " and diagonal shape: ", diag_shape.DebugString())); - xla::ComputationBuilder* builder = context->builder(); - xla::ComputationDataHandle input = context->Input(0); - xla::ComputationDataHandle diag = context->Input(1); + xla::XlaBuilder* builder = context->builder(); + xla::XlaOp input = context->Input(0); + xla::XlaOp diag = context->Input(1); auto zero = XlaHelpers::Zero(builder, context->input_type(0)); // Create an indicator tensor that is true only on the diagonal. 
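The `offset` built from the two iotas in the MatrixBandPartOp hunk above is just `col - row`. For reference, the band selection it feeds can be sketched standalone (based on the documented semantics of MatrixBandPart, which are not fully visible in this hunk and are therefore an assumption here; a negative bound keeps that whole triangle):

```c++
#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// Keep element (i, j) if it lies within num_lower sub-diagonals and num_upper
// super-diagonals of the main diagonal. offset = j - i is the quantity the
// XLA kernel derives from broadcast(iota_n) - iota_m.
Matrix MatrixBandPart(const Matrix& in, int num_lower, int num_upper) {
  Matrix out = in;
  for (int i = 0; i < static_cast<int>(in.size()); ++i) {
    for (int j = 0; j < static_cast<int>(in[i].size()); ++j) {
      int offset = j - i;
      bool in_lower = num_lower < 0 || -offset <= num_lower;
      bool in_upper = num_upper < 0 || offset <= num_upper;
      if (!(in_lower && in_upper)) out[i][j] = 0.0f;
    }
  }
  return out;
}

int main() {
  Matrix m = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}};
  Matrix banded = MatrixBandPart(m, /*num_lower=*/1, /*num_upper=*/0);
  for (const auto& row : banded) {
    for (float v : row) std::printf("%g ", v);
    std::printf("\n");  // prints: 1 0 0 / 4 5 0 / 0 8 9
  }
  return 0;
}
```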
- xla::ComputationDataHandle iota_m; + xla::XlaOp iota_m; OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, m, &iota_m)); - xla::ComputationDataHandle iota_n; + xla::XlaOp iota_n; OP_REQUIRES_OK(context, XlaHelpers::Iota(builder, DT_INT32, n, &iota_n)); auto indicator = builder->Eq(iota_m, builder->Broadcast(iota_n, {m}), diff --git a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc index 05a36a031ad73b..7e9de3ef9b245c 100644 --- a/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/mirror_pad_op.cc @@ -25,10 +25,11 @@ class MirrorPadOp : public XlaOpKernel { public: explicit MirrorPadOp(OpKernelConstruction* context) : XlaOpKernel(context) {} - xla::StatusOr DoMirrorPad( - const xla::ComputationDataHandle& t, const xla::Shape& original_shape, - const xla::Literal& pad_literal, xla::ComputationBuilder* b) { - xla::ComputationDataHandle accum = t; + xla::StatusOr DoMirrorPad(const xla::XlaOp& t, + const xla::Shape& original_shape, + const xla::Literal& pad_literal, + xla::XlaBuilder* b) { + xla::XlaOp accum = t; for (int64 dimno = xla::ShapeUtil::Rank(original_shape) - 1; dimno >= 0; --dimno) { auto t_rev = b->Rev(accum, {dimno}); @@ -76,12 +77,12 @@ class MirrorPadOp : public XlaOpKernel { OP_REQUIRES_OK( ctx, ctx->ConstantInputReshaped(1, {fixed_dims, 2}, &pad_literal)); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); auto in0 = ctx->Input(0); - xla::StatusOr> in0_shape = b->GetShape(in0); + xla::StatusOr in0_shape = b->GetShape(in0); OP_REQUIRES(ctx, in0_shape.ok(), in0_shape.status()); - xla::StatusOr accum_status = - DoMirrorPad(in0, *in0_shape.ValueOrDie(), pad_literal, b); + xla::StatusOr accum_status = + DoMirrorPad(in0, in0_shape.ValueOrDie(), pad_literal, b); OP_REQUIRES_OK(ctx, accum_status.status()); diff --git a/tensorflow/compiler/tf2xla/kernels/no_op.cc b/tensorflow/compiler/tf2xla/kernels/no_op.cc index 8c8a9bbe787f32..65ab9da8d7ca05 100644 --- a/tensorflow/compiler/tf2xla/kernels/no_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/no_op.cc @@ -24,8 +24,7 @@ namespace tensorflow { REGISTER_XLA_OP(Name("NoOp").CompilationOnly(), NoOp); // We register ControlTrigger as a no-op. This is correct since nodes seen -// by the XLA compiler are never dead. This may need rethinking when we add -// support for conditionals to XLA. -REGISTER_XLA_OP(Name("ControlTrigger"), NoOp); +// by the XLA compiler are never dead. 
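`DoMirrorPad` above walks the dimensions, reverses the accumulated tensor along each one, and concatenates slices of the reversed copy on both sides. A standalone 1-D sketch of that idea for REFLECT-style padding (illustrative semantics only, not the kernel's exact slicing; `MirrorPad1D` is a hypothetical helper):

```c++
#include <iostream>
#include <vector>

// REFLECT-style mirror padding of a 1-D sequence, built the way the XLA
// kernel builds it: reverse the input, then concatenate slices of the
// reversed copy before and after the original (the edge value is not
// repeated). Requires pad_lo and pad_hi to be smaller than in.size().
std::vector<int> MirrorPad1D(const std::vector<int>& in, int pad_lo,
                             int pad_hi) {
  std::vector<int> rev(in.rbegin(), in.rend());
  std::vector<int> out;
  // Last pad_lo elements of `rev`, excluding the edge element itself.
  out.insert(out.end(), rev.end() - pad_lo - 1, rev.end() - 1);
  out.insert(out.end(), in.begin(), in.end());
  // First pad_hi elements of `rev`, excluding the edge element itself.
  out.insert(out.end(), rev.begin() + 1, rev.begin() + 1 + pad_hi);
  return out;
}

int main() {
  std::vector<int> padded =
      MirrorPad1D({1, 2, 3, 4}, /*pad_lo=*/2, /*pad_hi=*/1);
  for (int v : padded) std::cout << v << " ";  // prints: 3 2 1 2 3 4 3
  std::cout << "\n";
  return 0;
}
```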
+REGISTER_XLA_OP(Name("ControlTrigger").CompilationOnly(), NoOp); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc index 9f7c9913802d31..cac2eea96eeed7 100644 --- a/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/one_hot_op.cc @@ -62,7 +62,7 @@ class OneHotOp : public XlaOpKernel { ctx, depth >= 0, errors::InvalidArgument("depth must be non-negative, got: ", depth)); - xla::ComputationDataHandle one_hot; + xla::XlaOp one_hot; OP_REQUIRES_OK( ctx, XlaHelpers::OneHot(ctx->builder(), depth, axis, input_type(0), indices_shape, ctx->Input(0), ctx->Input(2), diff --git a/tensorflow/compiler/tf2xla/kernels/pack_op.cc b/tensorflow/compiler/tf2xla/kernels/pack_op.cc index a4318e29d2532f..aecaabb6dcf46b 100644 --- a/tensorflow/compiler/tf2xla/kernels/pack_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pack_op.cc @@ -43,7 +43,7 @@ class PackOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - std::vector values; + std::vector values; std::vector shapes; OP_REQUIRES_OK(ctx, ctx->InputList("values", &values, &shapes)); const int num = values.size(); @@ -69,7 +69,7 @@ class PackOp : public XlaOpKernel { -expanded_num_dims, ", ", expanded_num_dims, ")")); - std::vector reshaped_inputs(num); + std::vector reshaped_inputs(num); TensorShape child_shape(shapes[0]); child_shape.InsertDim(axis, 1); diff --git a/tensorflow/compiler/tf2xla/kernels/pad_op.cc b/tensorflow/compiler/tf2xla/kernels/pad_op.cc index 791351637aee61..7c95475e7b1f02 100644 --- a/tensorflow/compiler/tf2xla/kernels/pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/pad_op.cc @@ -70,7 +70,7 @@ class PadOp : public XlaOpKernel { } // PadV2 added a "constant_values" input that indicates the pad value. - xla::ComputationDataHandle constant_values; + xla::XlaOp constant_values; if (ctx->num_inputs() == 3) { OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(ctx->InputShape(2)), errors::InvalidArgument("constant_values must be a scalar.")); diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 5f635dd1bc6122..f8e7b48a0fd948 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -66,15 +66,15 @@ class PoolingOp : public XlaOpKernel { int num_dims() const { return num_spatial_dims_ + 2; } // Method that builds an initial value to use in reductions. - virtual xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) = 0; + virtual xla::XlaOp InitValue(xla::XlaBuilder* b) = 0; // The reduction operation to apply to each window. - virtual const xla::Computation* Reduction(XlaOpKernelContext* ctx) = 0; + virtual const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) = 0; // A post-processing operation to apply on the outputs of the ReduceWindow. 
- virtual xla::ComputationDataHandle PostProcessOutput( - XlaOpKernelContext* ctx, const xla::ComputationDataHandle& output, - DataType dtype, const TensorShape& input_shape) = 0; + virtual xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, + const xla::XlaOp& output, DataType dtype, + const TensorShape& input_shape) = 0; void Compile(XlaOpKernelContext* ctx) override { std::vector ksize = ksize_; @@ -110,7 +110,7 @@ class PoolingOp : public XlaOpKernel { " operator must have ", num_dims(), " dimensions")); - xla::ComputationBuilder* const b = ctx->builder(); + xla::XlaBuilder* const b = ctx->builder(); auto input = XlaHelpers::ConvertElementType(b, ctx->Input(0), reduction_type_); auto reduce = ctx->builder()->ReduceWindow( @@ -135,17 +135,17 @@ class MaxPoolOp : public PoolingOp { : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims, /*reduction_type=*/ctx->input_type(0)) {} - xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) override { + xla::XlaOp InitValue(xla::XlaBuilder* b) override { return XlaHelpers::MinValue(b, reduction_type_); } - const xla::Computation* Reduction(XlaOpKernelContext* ctx) override { + const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) override { return ctx->GetOrCreateMax(reduction_type_); } - xla::ComputationDataHandle PostProcessOutput( - XlaOpKernelContext* ctx, const xla::ComputationDataHandle& output, - DataType dtype, const TensorShape& input_shape) override { + xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, + const xla::XlaOp& output, DataType dtype, + const TensorShape& input_shape) override { return output; } }; @@ -176,9 +176,9 @@ REGISTER_XLA_OP(Name("MaxPool3D"), MaxPool3DOp); // Common computation shared between AvgPool and AvgPoolGrad. Divide each // element of an image by the count of elements that contributed to that // element during pooling. 
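A standalone sketch of the divide-by-count step described above (not the actual lowering; `AvgPool1DSame` is a hypothetical 1-D helper): with valid padding every output sees a full window, so dividing by the window size suffices, but with SAME padding the border outputs receive contributions from fewer elements and need a per-position count.

```c++
#include <algorithm>
#include <cstdio>
#include <vector>

// 1-D average pooling with SAME padding, stride 1: each output is the window
// sum divided by the number of in-bounds elements in that window. The XLA
// kernel gets the same effect from a sum ReduceWindow followed by an
// element-wise divide by these counts.
std::vector<float> AvgPool1DSame(const std::vector<float>& in, int ksize) {
  const int n = static_cast<int>(in.size());
  std::vector<float> out(n, 0.0f);
  for (int i = 0; i < n; ++i) {
    int lo = std::max(0, i - ksize / 2);
    int hi = std::min(n, i + ksize / 2 + 1);
    float sum = 0.0f;
    for (int j = lo; j < hi; ++j) sum += in[j];
    out[i] = sum / (hi - lo);  // divide by the *actual* element count
  }
  return out;
}

int main() {
  std::vector<float> out = AvgPool1DSame({1, 2, 3, 4}, /*ksize=*/3);
  for (float v : out) std::printf("%g ", v);  // prints: 1.5 2 3 3.5
  std::printf("\n");
  return 0;
}
```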
-static xla::ComputationDataHandle AvgPoolDivideByCount( - XlaOpKernelContext* ctx, const xla::ComputationDataHandle& output, - DataType dtype, const TensorShape& input_shape, xla::Padding padding, +static xla::XlaOp AvgPoolDivideByCount( + XlaOpKernelContext* ctx, const xla::XlaOp& output, DataType dtype, + const TensorShape& input_shape, xla::Padding padding, const std::vector& ksize, const std::vector& stride, int num_spatial_dims, TensorFormat data_format) { if (padding == xla::Padding::kValid) { @@ -234,17 +234,17 @@ class AvgPoolOp : public PoolingOp { /*reduction_type=*/ XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} - xla::ComputationDataHandle InitValue(xla::ComputationBuilder* b) override { + xla::XlaOp InitValue(xla::XlaBuilder* b) override { return XlaHelpers::Zero(b, reduction_type_); } - const xla::Computation* Reduction(XlaOpKernelContext* ctx) override { + const xla::XlaComputation* Reduction(XlaOpKernelContext* ctx) override { return ctx->GetOrCreateAdd(reduction_type_); } - xla::ComputationDataHandle PostProcessOutput( - XlaOpKernelContext* ctx, const xla::ComputationDataHandle& output, - DataType dtype, const TensorShape& input_shape) override { + xla::XlaOp PostProcessOutput(XlaOpKernelContext* ctx, + const xla::XlaOp& output, DataType dtype, + const TensorShape& input_shape) override { return AvgPoolDivideByCount(ctx, output, dtype, input_shape, padding_, ksize_, stride_, num_spatial_dims_, data_format_); @@ -344,11 +344,10 @@ class MaxPoolGradOp : public XlaOpKernel { xla::PrimitiveType element_type; OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(2), &element_type)); - xla::ComputationDataHandle init_value = - XlaHelpers::Zero(ctx->builder(), input_type(2)); + xla::XlaOp init_value = XlaHelpers::Zero(ctx->builder(), input_type(2)); auto select = CreateScalarGeComputation(element_type, ctx->builder()); auto scatter = CreateScalarAddComputation(element_type, ctx->builder()); - xla::ComputationDataHandle gradients = ctx->builder()->SelectAndScatter( + xla::XlaOp gradients = ctx->builder()->SelectAndScatter( input, select, ksize_, stride_, xla_padding, out_backprop, init_value, scatter); @@ -462,7 +461,7 @@ class AvgPoolGradOp : public XlaOpKernel { // The input gradients are computed by a convolution of the output gradients // and the filter, with some appropriate padding. See the comment at the top // of conv_grad_ops.h for details. - xla::ComputationBuilder* const b = ctx->builder(); + xla::XlaBuilder* const b = ctx->builder(); auto out_backprop = ctx->Input(1); auto dtype = input_type(1); xla::Padding xla_padding = diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc index 4171e076ff6d9d..661cd5923e1023 100644 --- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc @@ -35,7 +35,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); const DataType data_type = ctx->input_type(0); // Comments taken from semantics description at @@ -46,8 +46,8 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { // m = max(abs(input_min), abs(input_max)) if range_given is true, // m = max(abs(min_elem(input)), // abs(max_elem(input))) otherwise. 
- xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle input_min, input_max; + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp input_min, input_max; if (range_given_) { double input_min_value, input_max_value; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsFloatScalar(1, &input_min_value)); @@ -55,14 +55,14 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { input_min = XlaHelpers::FloatLiteral(b, data_type, input_min_value); input_max = XlaHelpers::FloatLiteral(b, data_type, input_max_value); } else { - const xla::Computation* fmax = ctx->GetOrCreateMax(data_type); - const xla::Computation* fmin = ctx->GetOrCreateMin(data_type); + const xla::XlaComputation* fmax = ctx->GetOrCreateMax(data_type); + const xla::XlaComputation* fmin = ctx->GetOrCreateMin(data_type); input_min = b->ReduceAll(input, XlaHelpers::MaxValue(b, data_type), *fmin); input_max = b->ReduceAll(input, XlaHelpers::MinValue(b, data_type), *fmax); } - xla::ComputationDataHandle m = b->Max(b->Abs(input_min), b->Abs(input_max)); + xla::XlaOp m = b->Max(b->Abs(input_min), b->Abs(input_max)); // Next, we choose our fixed-point quantization buckets, [min_fixed, // max_fixed]. If signed_input is true, this is @@ -85,7 +85,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { // From this we compute our scaling factor, s: // // s = (max_fixed - min_fixed) / (2 * m). - xla::ComputationDataHandle s = + xla::XlaOp s = b->Div(XlaHelpers::FloatLiteral(b, data_type, max_fixed - min_fixed), b->Mul(XlaHelpers::FloatLiteral(b, data_type, 2.0), m)); @@ -93,7 +93,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel { // e is transformed into e': // // e' = (e * s).round_to_nearest() / s. - xla::ComputationDataHandle result = b->Div(b->Round(b->Mul(input, s)), s); + xla::XlaOp result = b->Div(b->Round(b->Mul(input, s)), s); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index c0994c434bca51..39149d56adb244 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -17,6 +17,7 @@ limitations under the License. // TODO(misard,phawkins): handle random number generator seeds/states correctly. // TODO(misard,phawkins): add tests. 
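A standalone numeric walk-through of the quantize-and-dequantize formulas in the comments above, i.e. the scale s = (max_fixed - min_fixed) / (2 * m) and the round trip e' = round(e * s) / s. The signed 8-bit bucket bounds (-127, 127) and the value of m used here are assumptions for illustration, not values taken from this hunk.

```c++
#include <cmath>
#include <cstdio>

int main() {
  // Assumed signed 8-bit fixed-point buckets:
  // min_fixed = -(2^(num_bits - 1) - 1), max_fixed = 2^(num_bits - 1) - 1.
  const int num_bits = 8;
  const double min_fixed = -(std::pow(2, num_bits - 1) - 1);  // -127
  const double max_fixed = std::pow(2, num_bits - 1) - 1;     //  127
  // m is the largest input magnitude, e.g. max(|input_min|, |input_max|);
  // assumed to be 3.0 for this example.
  const double m = 3.0;
  const double s = (max_fixed - min_fixed) / (2.0 * m);  // 254 / 6 = 42.333...

  // Round-trip a sample element e through the fixed-point grid.
  const double e = 1.234;
  const double e_prime = std::round(e * s) / s;
  // e * s = 52.239..., rounds to 52, so e' = 52 / 42.333... = 1.228346...
  std::printf("s = %.4f, e = %.4f -> e' = %.6f\n", s, e, e_prime);
  return 0;
}
```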
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -41,9 +42,9 @@ class RandomUniformOp : public XlaOpKernel { xla::Shape xla_shape; OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape)); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle result = b->RngUniform( - XlaHelpers::Zero(b, dtype), XlaHelpers::One(b, dtype), xla_shape); + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp result = b->RngUniform(XlaHelpers::Zero(b, dtype), + XlaHelpers::One(b, dtype), xla_shape); ctx->SetOutput(0, result); } @@ -100,11 +101,11 @@ class RandomStandardNormalOp : public XlaOpKernel { xla::Shape xla_shape; OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape)); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); // Normal distribution with a mean of 0 and a standard deviation of 1: - xla::ComputationDataHandle result = b->RngNormal( - XlaHelpers::Zero(b, dtype), XlaHelpers::One(b, dtype), xla_shape); + xla::XlaOp result = b->RngNormal(XlaHelpers::Zero(b, dtype), + XlaHelpers::One(b, dtype), xla_shape); ctx->SetOutput(0, result); } @@ -127,22 +128,16 @@ class TruncatedNormalOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); xla::Shape xla_shape; OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape)); - xla::Shape xla_element_shape = - xla::ShapeUtil::MakeShape(xla_shape.element_type(), {}); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle mean = XlaHelpers::Zero(b, dtype); - xla::ComputationDataHandle stddev = XlaHelpers::One(b, dtype); - xla::ComputationDataHandle candidate = - b->RngNormal(mean, stddev, xla_shape); + xla::XlaBuilder* b = ctx->builder(); - auto two_sd = [dtype](bool negate, xla::ComputationBuilder* b) { + auto two_sd = [dtype](bool negate, xla::XlaBuilder* b) { return XlaHelpers::FloatLiteral(b, dtype, negate ? 
-2.0 : 2.0); }; - auto out_of_range_mask = [two_sd](xla::ComputationDataHandle candidate, - xla::ComputationBuilder* b) { - xla::ComputationDataHandle too_large = b->Gt(candidate, two_sd(false, b)); - xla::ComputationDataHandle too_small = b->Lt(candidate, two_sd(true, b)); + auto out_of_range_mask = [two_sd](xla::XlaOp candidate, + xla::XlaBuilder* b) { + xla::XlaOp too_large = b->Gt(candidate, two_sd(false, b)); + xla::XlaOp too_small = b->Lt(candidate, two_sd(true, b)); return b->Or(too_large, too_small); }; @@ -152,37 +147,38 @@ class TruncatedNormalOp : public XlaOpKernel { // out_of_range_mask := candidate < mean-2*sd || candidate > mean+2*sd // candidate = select(out_of_range_mask, rng_normal(), candidate) // } - std::unique_ptr test_builder = - b->CreateSubBuilder("truncated_normal_test"); - { - auto* b = test_builder.get(); - xla::ComputationDataHandle candidate = - b->Parameter(0, xla_shape, "candidate"); - xla::ComputationDataHandle oor_mask = out_of_range_mask(candidate, b); - OP_REQUIRES_OK(ctx, Any(out_of_range_mask(candidate, b), b).status()); - } - - std::unique_ptr body_builder = - b->CreateSubBuilder("truncated_normal_body"); - { - auto* b = body_builder.get(); - xla::ComputationDataHandle candidate = - b->Parameter(0, xla_shape, "candidate"); - xla::ComputationDataHandle to_resample = out_of_range_mask(candidate, b); - xla::ComputationDataHandle mean = XlaHelpers::Zero(b, dtype); - xla::ComputationDataHandle stddev = XlaHelpers::One(b, dtype); - b->Select(to_resample, b->RngNormal(mean, stddev, xla_shape), candidate); - } - - xla::StatusOr test_computation = test_builder->Build(); - OP_REQUIRES_OK(ctx, test_computation.status()); - xla::StatusOr body_computation = body_builder->Build(); - OP_REQUIRES_OK(ctx, body_computation.status()); - xla::ComputationDataHandle result = - b->While(test_computation.ValueOrDie(), body_computation.ValueOrDie(), - candidate); - - ctx->SetOutput(0, result); + std::vector initial_values = { + // The current candidate. + b->Broadcast(XlaHelpers::Zero(b, dtype), shape.dim_sizes()), + // The to_resample mask, where 'true' identifies a location in the + // current candidate that is out of range and must be regenerated. + b->Broadcast(b->ConstantR0(true), shape.dim_sizes()), + // Is any element in the mask true? + b->ConstantR0(true)}; + auto condition = [&](gtl::ArraySlice values, + xla::XlaBuilder* b) -> xla::StatusOr { + // Continue while any element in the mask is true. + return values[2]; + }; + auto body = + [&](gtl::ArraySlice values, + xla::XlaBuilder* b) -> xla::StatusOr> { + xla::XlaOp candidate = values[0]; + xla::XlaOp to_resample = values[1]; + xla::XlaOp mean = XlaHelpers::Zero(b, dtype); + xla::XlaOp stddev = XlaHelpers::One(b, dtype); + candidate = b->Select(to_resample, b->RngNormal(mean, stddev, xla_shape), + candidate); + // Compute a new to_resample mask, and determine whether any value is + // still out of range. 
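The loop being assembled here keeps resampling only the out-of-range entries until none are left. A standalone host-side sketch of the same rejection scheme (mean 0, stddev 1, truncated to two standard deviations; a plain scalar loop instead of the per-element `to_resample` mask the XlaWhileLoop carries):

```c++
#include <cstdio>
#include <random>
#include <vector>

int main() {
  std::mt19937 rng(42);
  std::normal_distribution<double> normal(0.0, 1.0);

  // Rejection sampling for a truncated normal: any candidate outside
  // [-2, 2] is regenerated until it falls inside the range.
  std::vector<double> samples(8);
  for (double& x : samples) {
    do {
      x = normal(rng);
    } while (x < -2.0 || x > 2.0);
  }

  for (double x : samples) std::printf("%.3f ", x);
  std::printf("\n");  // every printed value lies within [-2, 2]
  return 0;
}
```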
+ to_resample = out_of_range_mask(candidate, b); + TF_ASSIGN_OR_RETURN(xla::XlaOp done, Any(to_resample, b)); + return std::vector{candidate, to_resample, done}; + }; + auto result = + XlaWhileLoop(condition, body, initial_values, "truncated_normal", b); + OP_REQUIRES_OK(ctx, result.status()); + ctx->SetOutput(0, result.ValueOrDie()[0]); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc index cb144bea9e429b..08894489ac77bb 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduce_window_op.cc @@ -19,7 +19,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op_kernel.h" @@ -65,7 +64,7 @@ class ReduceWindowOp : public XlaOpKernel { "rank (", padding_high_.size(), " vs. ", rank, ")")); - xla::ComputationBuilder* builder = context->builder(); + xla::XlaBuilder* builder = context->builder(); // Build the reducer function. XlaCompiler::Argument reducer_arg; @@ -95,15 +94,15 @@ class ReduceWindowOp : public XlaOpKernel { xla::ShapeUtil::HumanString(reducer.xla_output_shape))); // Wraps the reducer in a computation that unpacks the output tuple. - xla::Computation wrapper; + xla::XlaComputation wrapper; { - std::unique_ptr cb = + std::unique_ptr cb = builder->CreateSubBuilder("wrapper"); auto x = cb->Parameter(0, scalar_shape, "x"); auto y = cb->Parameter(1, scalar_shape, "y"); auto outputs = cb->Call(*reducer.computation, {x, y}); cb->GetTupleElement(outputs, 0); - xla::StatusOr result = cb->Build(); + xla::StatusOr result = cb->Build(); OP_REQUIRES_OK(context, result.status()); wrapper = std::move(result.ValueOrDie()); } @@ -113,7 +112,7 @@ class ReduceWindowOp : public XlaOpKernel { padding[i] = {padding_low_[i], padding_high_[i]}; } - xla::ComputationDataHandle output = builder->ReduceWindowWithGeneralPadding( + xla::XlaOp output = builder->ReduceWindowWithGeneralPadding( context->Input(0), context->Input(1), wrapper, window_dimensions_, window_strides_, padding); context->SetOutput(0, output); diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc index 812d258cd1677e..0f425637795e96 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc @@ -30,13 +30,11 @@ class SumOp : public XlaReductionOp { explicit SumOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx, XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} - xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* builder) override { + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { return XlaHelpers::Zero(builder, reduction_type_); } - void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) override { + void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) override { builder->Add(scalar_lhs, scalar_rhs); } }; @@ -49,14 +47,12 @@ class ProdOp : public XlaReductionOp { : XlaReductionOp(ctx, XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} - xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* 
builder) override { + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { return XlaHelpers::One(builder, reduction_type_); } - void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) override { + void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) override { builder->Mul(scalar_lhs, scalar_rhs); } }; @@ -69,14 +65,12 @@ class MinOp : public XlaReductionOp { explicit MinOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx, ctx->input_type(0)) {} - xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* builder) override { + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { return XlaHelpers::MaxValue(builder, reduction_type_); } - void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) override { + void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) override { builder->Min(scalar_lhs, scalar_rhs); } }; @@ -88,14 +82,12 @@ class MaxOp : public XlaReductionOp { explicit MaxOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx, ctx->input_type(0)) {} - xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* builder) override { + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { return XlaHelpers::MinValue(builder, reduction_type_); } - void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) override { + void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) override { builder->Max(scalar_lhs, scalar_rhs); } }; @@ -108,20 +100,17 @@ class MeanOp : public XlaReductionOp { : XlaReductionOp(ctx, XlaHelpers::SumAccumulationType(ctx->input_type(0))) {} - xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* builder) override { + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { return XlaHelpers::Zero(builder, reduction_type_); } - void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) override { + void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) override { builder->Add(scalar_lhs, scalar_rhs); } - xla::ComputationDataHandle BuildFinalizer( - xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& reduce_output, - int64 num_elements_reduced) override { + xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder, + const xla::XlaOp& reduce_output, + int64 num_elements_reduced) override { auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0), num_elements_reduced); return builder->Div(reduce_output, divisor); @@ -136,14 +125,12 @@ class AllOp : public XlaReductionOp { explicit AllOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx, ctx->input_type(0)) {} - xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* builder) override { + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { return builder->ConstantR0(true); } - void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) override { + void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) override { 
builder->And(scalar_lhs, scalar_rhs); } }; @@ -155,14 +142,12 @@ class AnyOp : public XlaReductionOp { explicit AnyOp(OpKernelConstruction* ctx) : XlaReductionOp(ctx, ctx->input_type(0)) {} - xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* builder) override { + xla::XlaOp InitialValue(xla::XlaBuilder* builder) override { return builder->ConstantR0(false); } - void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) override { + void BuildReducer(xla::XlaBuilder* builder, const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) override { builder->Or(scalar_lhs, scalar_rhs); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h index f3181f0dadc2d3..2ecfb854a1c862 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h @@ -19,7 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_ #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" namespace tensorflow { @@ -28,35 +28,33 @@ namespace tensorflow { // to override: description is a textual description of the mapped // function; InitialValue constructs the base case for the reduction; // BuildReducer adds the implementation of the reduction lambda to a -// xla::ComputationBuilder and BuildFinalizer adds the +// xla::XlaBuilder and BuildFinalizer adds the // implementation of the finalizer lambda (if there is one) to a -// xla::ComputationBuilder. +// xla::XlaBuilder. class XlaReductionOp : public XlaOpKernel { public: XlaReductionOp(OpKernelConstruction* ctx, DataType reduction_type); ~XlaReductionOp() override {} // Return the base case for the reduction. - virtual xla::ComputationDataHandle InitialValue( - xla::ComputationBuilder* builder) = 0; + virtual xla::XlaOp InitialValue(xla::XlaBuilder* builder) = 0; // Implement the (scalar,scalar)->scalar lambda that should be // applied to each pair of elements to be reduced. The desired // computation should be added to 'builder' and // '(scalar_lhs,scalar_rhs)' are the function's inputs. - virtual void BuildReducer(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& scalar_lhs, - const xla::ComputationDataHandle& scalar_rhs) = 0; + virtual void BuildReducer(xla::XlaBuilder* builder, + const xla::XlaOp& scalar_lhs, + const xla::XlaOp& scalar_rhs) = 0; // Applies a transformation to the output of the reduction. The desired // computation should be added to 'builder'. Argument 'reduce_output' is the // output of the reduction. 'num_elements_reduced' is the number of elements // that contributed to the reduction. Returns the transformed reduction // output, Defaults to returning 'reduce_output' unchanged. 
- virtual xla::ComputationDataHandle BuildFinalizer( - xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& reduce_output, - int64 num_elements_reduced); + virtual xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder, + const xla::XlaOp& reduce_output, + int64 num_elements_reduced); void Compile(XlaOpKernelContext* ctx) override; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 64fe765ae9a945..4fd5bfd03999a7 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -35,10 +35,9 @@ XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx, // Unless BuildFinalizer is overridden the reduction has no // finalizer. -xla::ComputationDataHandle XlaReductionOp::BuildFinalizer( - xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& reduce_output, - int64 num_elements_reduced) { +xla::XlaOp XlaReductionOp::BuildFinalizer(xla::XlaBuilder* builder, + const xla::XlaOp& reduce_output, + int64 num_elements_reduced) { return reduce_output; } @@ -96,9 +95,9 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { string desc = ctx->op_kernel().name(); - xla::ComputationBuilder* const b = ctx->builder(); + xla::XlaBuilder* const b = ctx->builder(); // Construct the builder for the reduction lambda. - xla::ComputationBuilder r(b->client(), strings::StrCat(desc, "-reduction")); + xla::XlaBuilder r(strings::StrCat(desc, "-reduction")); xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(reduction_type_, &type)); @@ -110,7 +109,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) { auto ry = r.Parameter(1, xla::ShapeUtil::MakeShape(type, {}), "y"); // Call virtual method to build the reduction lambda. BuildReducer(&r, rx, ry); - xla::Computation reduction_computation = r.Build().ConsumeValueOrDie(); + xla::XlaComputation reduction_computation = r.Build().ConsumeValueOrDie(); auto reduce = b->Reduce(data, initial, reduction_computation, xla_axes); auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0)); diff --git a/tensorflow/compiler/tf2xla/kernels/relu_op.cc b/tensorflow/compiler/tf2xla/kernels/relu_op.cc index 12a35529992e61..ba7d484d53d725 100644 --- a/tensorflow/compiler/tf2xla/kernels/relu_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/relu_op.cc @@ -18,7 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" @@ -32,7 +32,7 @@ class ReluOp : public XlaOpKernel { explicit ReluOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Computes the max of the scalar input x and 0. void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); auto zero = XlaHelpers::Zero(builder, input_type(0)); ctx->SetOutput(0, builder->Max(zero, ctx->Input(0))); } @@ -43,7 +43,7 @@ class Relu6Op : public XlaOpKernel { explicit Relu6Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} // Clamp the scalar input between 0 and 6. 
void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); auto zero = XlaHelpers::Zero(builder, input_type(0)); auto six = XlaHelpers::IntegerLiteral(builder, input_type(0), 6); ctx->SetOutput(0, builder->Clamp(zero, ctx->Input(0), six)); @@ -56,7 +56,7 @@ class ReluGradOp : public XlaOpKernel { // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return 0. void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const TensorShape shape = ctx->InputShape(0); const auto zero = b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); @@ -71,7 +71,7 @@ class Relu6GradOp : public XlaOpKernel { // Return the lhs (incoming gradient) if the rhs (input feature) > 0, // otherwise return 0. void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const TensorShape shape = ctx->InputShape(0); const auto zero = b->Broadcast(XlaHelpers::Zero(b, input_type(0)), shape.dim_sizes()); diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc index c283e3b02c2676..a711278638444b 100644 --- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -45,7 +45,7 @@ class RetvalOp : public XlaOpKernel { // compilation. OP_REQUIRES_OK(ctx, frame->SetRetval(index_, input)); } else { - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); const TensorShape input_shape = ctx->InputShape(0); auto is_constant = ctx->builder()->IsConstant(input); @@ -55,18 +55,33 @@ class RetvalOp : public XlaOpKernel { } XlaContext& tc = XlaContext::Get(ctx); - if (input_shape.num_elements() == 0 || is_constant.ValueOrDie()) { + if (tc.resolve_compile_time_constants() && + (input_shape.num_elements() == 0 || is_constant.ValueOrDie())) { xla::Literal literal; OP_REQUIRES_OK(ctx, ctx->ConstantInput(0, &literal)); OP_REQUIRES_OK(ctx, tc.AddConstRetval(index_, dtype_, literal)); } else { - // The core from which a return value is returned depends on the core - // assignment of the input to the retval .Since we can't change the core - // assignment of as this point, create a tuple/get-tuple-element - // combination so that the core will be set on them. - auto tuple_elem = - ctx->builder()->GetTupleElement(ctx->builder()->Tuple({input}), 0); - tc.AddRetval(index_, dtype_, tuple_elem); + TensorShape shape = ctx->InputShape(0); + TensorShape representation_shape = + tc.is_entry_computation() + ? tc.RepresentationShape(shape, ctx->input_type(0)) + : shape; + + xla::XlaOp output = input; + if (tc.is_entry_computation()) { + output = + ctx->builder()->Reshape(input, representation_shape.dim_sizes()); + } else { + // The core from which a return value is returned depends on the + // device assignment of the input to the retval. 
Since we can't change + // the device assignment of "input" at this point, we must always + // introduce an operator here, even if the shape does not change. + // TODO(b/76097077): propagate device assignments onto arguments and + // return values of functions, and then reshape unconditionally. + output = ctx->builder()->GetTupleElement( + ctx->builder()->Tuple({output}), 0); + } + tc.AddRetval(index_, dtype_, shape, output); } } } diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc index e51d386926763e..2872a3c4d49d0d 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc @@ -48,7 +48,7 @@ class ReverseOp : public XlaOpKernel { ctx->SetOutput(0, ctx->Input(0)); return; } - // ComputationBuilder::Rev() requires concrete values for dimensions arg. + // XlaBuilder::Rev() requires concrete values for dimensions arg. xla::Literal lax; OP_REQUIRES_OK(ctx, ctx->ConstantInputReshaped(1, {x_shape.dims()}, &lax)); std::vector revdims(x_shape.dims()); @@ -90,7 +90,7 @@ class ReverseV2Op : public XlaOpKernel { ctx->SetOutput(0, ctx->Input(0)); return; } - // ComputationBuilder::Rev() requires concrete values for dimensions arg. + // XlaBuilder::Rev() requires concrete values for dimensions arg. std::vector axes; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &axes)); diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc index 6bc5d3adb091cd..5d1c05268493f4 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc @@ -54,7 +54,7 @@ class ReverseSequenceOp : public XlaOpKernel { "), ", "(", seq_lens_shape.num_elements(), " vs. ", input_shape.dim_size(batch_dim_))); - xla::ComputationBuilder* builder = context->builder(); + xla::XlaBuilder* builder = context->builder(); const auto input = context->Input(0); const auto seq_lens = context->Input(1); @@ -106,20 +106,40 @@ class ReverseSequenceOp : public XlaOpKernel { seq_lens, body_builder->Reshape(i, {1}), {1}); // Indices is the offset of the batch element in the input. - auto indices = body_builder->Broadcast( + auto batch_element_indices = body_builder->Broadcast( XlaHelpers::Zero(body_builder.get(), seq_lens_type), {input_shape.dims()}); - indices = body_builder->DynamicUpdateSlice( - indices, body_builder->Reshape(i, {1}), + batch_element_indices = body_builder->DynamicUpdateSlice( + batch_element_indices, body_builder->Reshape(i, {1}), body_builder->Reshape( XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, batch_dim_), {1})); - // slice_indices is the offset of the start of the reversed sequence in - // the input. - auto slice_indices = body_builder->DynamicUpdateSlice( - indices, + // Slice out the current batch element and pad it out in the sequence + // dimension. + TensorShape slice_shape = input_shape; + slice_shape.set_dim(batch_dim_, 1); + slice_shape.set_dim(seq_dim_, max_seq_len); + auto slice = body_builder->DynamicSlice(output, batch_element_indices, + slice_shape.dim_sizes()); + auto padding_config = xla::MakeNoPaddingConfig(slice_shape.dims()); + padding_config.mutable_dimensions(seq_dim_)->set_edge_padding_high( + slice_shape.dim_size(seq_dim_)); + slice = body_builder->Pad( + slice, XlaHelpers::Zero(body_builder.get(), input_type), + padding_config); + + // Now slice out the reversed sequence from its actual start. 
+ // sequence_start_indices is the offset of the start of the reversed + // sequence in the input. The slice will go into the padding, however, we + // will mask off these elements and replace them with elements from the + // original input so their values do not matter. + auto sequence_start_indices = body_builder->Broadcast( + XlaHelpers::Zero(body_builder.get(), seq_lens_type), + {slice_shape.dims()}); + sequence_start_indices = body_builder->DynamicUpdateSlice( + sequence_start_indices, body_builder->Sub(XlaHelpers::IntegerLiteral( body_builder.get(), seq_lens_type, max_seq_len), seq_len), @@ -127,18 +147,12 @@ class ReverseSequenceOp : public XlaOpKernel { XlaHelpers::IntegerLiteral(body_builder.get(), seq_lens_type, seq_dim_), {1})); - - // Slice out the reversed sequence. The slice will overflow the end of the - // sequence, and the contents of the overflow are implementation-defined. - // However, we will mask off these elements and replace them with elements - // from the original input so their values do not matter. - TensorShape slice_shape = input_shape; - slice_shape.set_dim(batch_dim_, 1); - auto slice = body_builder->DynamicSlice(output, slice_indices, - slice_shape.dim_sizes()); + slice = body_builder->DynamicSlice(slice, sequence_start_indices, + slice_shape.dim_sizes()); // Shift the reversed sequence to the left. - output = body_builder->DynamicUpdateSlice(output, slice, indices); + output = body_builder->DynamicUpdateSlice(output, slice, + batch_element_indices); body_builder->Tuple( {body_builder->Add( @@ -155,7 +169,7 @@ class ReverseSequenceOp : public XlaOpKernel { auto output = builder->GetTupleElement(loop_output, 2); // Mask out elements after the sequence length. - xla::ComputationDataHandle iota; + xla::XlaOp iota; OP_REQUIRES_OK( context, XlaHelpers::Iota(builder, seq_lens_type, max_seq_len, &iota)); std::vector dims(input_shape.dims(), 1); diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc index 4cfa28a0ce3d7d..1819fb543317ee 100644 --- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc @@ -74,7 +74,7 @@ class ScanOp : public XlaOpKernel { return; } - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); std::vector window_strides(input_shape.dims(), 1); std::vector window_dims(input_shape.dims(), 1); @@ -91,8 +91,8 @@ class ScanOp : public XlaOpKernel { std::swap(padding[axis].first, padding[axis].second); } - xla::ComputationDataHandle init; - const xla::Computation* reducer; + xla::XlaOp init; + const xla::XlaComputation* reducer; if (sum_) { init = XlaHelpers::Zero(builder, dtype); reducer = ctx->GetOrCreateAdd(dtype); diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc index 8433a29c4e203c..f2c63b4f9083ad 100644 --- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc @@ -102,7 +102,7 @@ class ScatterNdOp : public XlaOpKernel { OP_REQUIRES_OK(context, ValidateUpdateShape(buffer_shape, indices_shape, updates_shape)); - xla::ComputationBuilder* builder = context->builder(); + xla::XlaBuilder* builder = context->builder(); auto buffer = builder->Broadcast(XlaHelpers::Zero(builder, dtype), buffer_shape.dim_sizes()); auto indices = context->Input(0); diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc 
index 498342a98881df..664078ca16c6d5 100644 --- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc @@ -17,7 +17,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" namespace tensorflow { namespace { @@ -62,16 +62,16 @@ class UnsortedSegmentSum : public XlaOpKernel { d, " differs ", data_shape.dim_size(d), " vs. ", indices_shape.dim_size(d))); } - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); TensorShape buffer_shape = data_shape; buffer_shape.RemoveDimRange(0, indices_shape.dims()); buffer_shape.InsertDim(0, num_segments); auto buffer = builder->Broadcast(XlaHelpers::Zero(builder, dtype_), buffer_shape.dim_sizes()); - auto combiner = - [](xla::ComputationDataHandle a, xla::ComputationDataHandle b, - xla::ComputationBuilder* builder) { return builder->Add(a, b); }; + auto combiner = [](xla::XlaOp a, xla::XlaOp b, xla::XlaBuilder* builder) { + return builder->Add(a, b); + }; auto result = XlaScatter(buffer, /*updates=*/data, indices, /*indices_are_vectors=*/false, combiner, builder); diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc index 8081d3c41c4363..f9f48164d63492 100644 --- a/tensorflow/compiler/tf2xla/kernels/select_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc @@ -40,7 +40,7 @@ class SelectOp : public XlaOpKernel { "'then' and 'else' must have the same size. but received: ", then_shape.DebugString(), " vs. ", else_shape.DebugString())); - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); auto cond_handle = ctx->Input(0); auto then_handle = ctx->Input(1); diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc index d079b89861817a..9ce01d0d44509b 100644 --- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc @@ -18,7 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc index 463788b8b461c3..bbf5ee8b12186a 100644 --- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc @@ -43,8 +43,8 @@ class SoftmaxOp : public XlaOpKernel { const DataType type = input_type(0); auto logits = ctx->Input(0); - xla::ComputationBuilder* const b = ctx->builder(); - const xla::Computation& max_func = *ctx->GetOrCreateMax(type); + xla::XlaBuilder* const b = ctx->builder(); + const xla::XlaComputation& max_func = *ctx->GetOrCreateMax(type); // Find the max in each batch, resulting in a tensor of shape [batch] auto logits_max = @@ -76,16 +76,15 @@ class SoftmaxOp : public XlaOpKernel { REGISTER_XLA_OP(Name("Softmax"), SoftmaxOp); REGISTER_XLA_OP(Name("LogSoftmax"), SoftmaxOp); -std::pair -CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type, - const xla::ComputationDataHandle& logits, - const xla::ComputationDataHandle& labels) { - const xla::Computation& max_func = *ctx->GetOrCreateMax(type); +std::pair CrossEntropyWithLogits( + XlaOpKernelContext* ctx, DataType type, const xla::XlaOp& logits, + const xla::XlaOp& labels) { + const xla::XlaComputation& max_func = *ctx->GetOrCreateMax(type); const int kBatchDim = 0; const int kClassDim = 1; - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); // Find the max in each batch, resulting in a tensor of shape [batch] auto logits_max = b->Reduce(logits, XlaHelpers::MinValue(b, type), max_func, {kClassDim}); @@ -123,7 +122,7 @@ CrossEntropyWithLogits(XlaOpKernelContext* ctx, DataType type, // backprop: prob - labels, where // prob = exp(logits - max_logits) / sum(exp(logits - max_logits)) // (where the division broadcasts along the batch dimension) - xla::ComputationDataHandle backprop = + xla::XlaOp backprop = b->Sub(b->Div(exp_shifted_logits, sum_exp, {kBatchDim}), labels); return {loss, backprop}; } @@ -150,7 +149,7 @@ class SoftmaxXentWithLogitsOp : public XlaOpKernel { auto logits = ctx->Input(0); auto labels = ctx->Input(1); - xla::ComputationDataHandle loss, backprop; + xla::XlaOp loss, backprop; std::tie(loss, backprop) = CrossEntropyWithLogits(ctx, type, logits, labels); ctx->SetOutput(0, loss); @@ -191,10 +190,10 @@ class SparseSoftmaxXentWithLogitsOp : public XlaOpKernel { DataType logits_type = input_type(0); DataType indices_type = input_type(1); - xla::ComputationDataHandle indices = ctx->Input(1); + xla::XlaOp indices = ctx->Input(1); - xla::ComputationBuilder* builder = ctx->builder(); - xla::ComputationDataHandle labels; + xla::XlaBuilder* builder = ctx->builder(); + xla::XlaOp labels; OP_REQUIRES_OK(ctx, XlaHelpers::OneHot( builder, depth, /*axis=*/1, input_type(1), labels_shape, @@ -207,7 +206,7 @@ class SparseSoftmaxXentWithLogitsOp : public XlaOpKernel { // Builds a vector of {batch_size} that is 0 if the index is in range, or // NaN otherwise; then add that vector to the labels to force out-of-range // values to NaNs. 
- xla::ComputationDataHandle nan_or_zero = builder->Select( + xla::XlaOp nan_or_zero = builder->Select( builder->And( builder->Le(XlaHelpers::Zero(builder, indices_type), indices), builder->Lt(indices, XlaHelpers::IntegerLiteral( @@ -218,7 +217,7 @@ class SparseSoftmaxXentWithLogitsOp : public XlaOpKernel { {batch_size})); labels = builder->Add(labels, nan_or_zero, {0}); - xla::ComputationDataHandle loss, backprop; + xla::XlaOp loss, backprop; std::tie(loss, backprop) = CrossEntropyWithLogits(ctx, logits_type, ctx->Input(0), labels); ctx->SetOutput(0, loss); diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc index 01b46e160d1f1f..ec077924b5b5af 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc @@ -20,9 +20,8 @@ limitations under the License. namespace tensorflow { namespace { -void SpaceToBatch(XlaOpKernelContext* ctx, - const xla::ComputationDataHandle& input, DataType input_dtype, - const TensorShape& input_tensor_shape, +void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, + DataType input_dtype, const TensorShape& input_tensor_shape, gtl::ArraySlice block_shape, const xla::Literal& paddings) { const int input_rank = input_tensor_shape.dims(); @@ -46,7 +45,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, ", 2] instead of ", xla::ShapeUtil::HumanString(paddings.shape()))); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); // 1. Zero-pad the start and end of dimensions `[1, ..., M]` of the // input according to `paddings` to produce `padded` of shape `padded_shape`. @@ -73,7 +72,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, errors::InvalidArgument( "The product of the block dimensions must be positive")); - xla::ComputationDataHandle padded = + xla::XlaOp padded = b->Pad(input, XlaHelpers::Zero(b, input_dtype), padding_config); // 2. Reshape `padded` to `reshaped_padded` of shape: @@ -101,8 +100,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, std::copy(remainder_shape.begin(), remainder_shape.end(), reshaped_padded_shape.begin() + 1 + 2 * block_rank); - xla::ComputationDataHandle reshaped_padded = - b->Reshape(padded, reshaped_padded_shape); + xla::XlaOp reshaped_padded = b->Reshape(padded, reshaped_padded_shape); // 3. Permute dimensions of `reshaped_padded` to produce // `permuted_reshaped_padded` of shape: @@ -121,7 +119,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, permutation[block_rank] = 0; std::iota(permutation.begin() + 1 + block_rank * 2, permutation.end(), 1 + block_rank * 2); - xla::ComputationDataHandle permuted_reshaped_padded = + xla::XlaOp permuted_reshaped_padded = b->Transpose(reshaped_padded, permutation); // 4. 
Reshape `permuted_reshaped_padded` to flatten `block_shape` into the @@ -142,8 +140,7 @@ void SpaceToBatch(XlaOpKernelContext* ctx, std::copy(remainder_shape.begin(), remainder_shape.end(), output_shape.begin() + 1 + block_rank); - xla::ComputationDataHandle output = - b->Reshape(permuted_reshaped_padded, output_shape); + xla::XlaOp output = b->Reshape(permuted_reshaped_padded, output_shape); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc index 806fda632cde64..4c5886ee2a0f63 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc @@ -50,8 +50,8 @@ class SpaceToDepthOp : public XlaOpKernel { const gtl::InlinedVector input_shape = input_tensor_shape.dim_sizes(); - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp input = ctx->Input(0); int feature_dim = GetTensorFeatureDimIndex(input_rank, data_format_); int num_spatial_dims = GetTensorSpatialDims(input_rank, data_format_); @@ -135,7 +135,7 @@ class SpaceToDepthOp : public XlaOpKernel { // input_shape[1] / block_size_, block_size_, // input_shape[2] / block_size_, block_size_, // depth] - xla::ComputationDataHandle reshaped = b->Reshape(input, reshaped_shape); + xla::XlaOp reshaped = b->Reshape(input, reshaped_shape); // 2. Permute dimensions of `reshaped` to produce // `permuted_reshaped` of shape: @@ -145,8 +145,7 @@ class SpaceToDepthOp : public XlaOpKernel { // input_shape[2] / block_size_, // block_size_, block_size_, // depth] - xla::ComputationDataHandle permuted_reshaped = - b->Transpose(reshaped, transpose_order); + xla::XlaOp permuted_reshaped = b->Transpose(reshaped, transpose_order); // 3. Reshape `permuted_reshaped` to flatten `block_shape` into the // batch dimension, producing an output tensor of shape: @@ -156,8 +155,7 @@ class SpaceToDepthOp : public XlaOpKernel { // input_shape[2] / block_size_, // block_size_ * block_size_ * depth] // - xla::ComputationDataHandle output = - b->Reshape(permuted_reshaped, output_shape); + xla::XlaOp output = b->Reshape(permuted_reshaped, output_shape); ctx->SetOutput(0, output); } diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc index 43c15e75380535..8958b2e7701e62 100644 --- a/tensorflow/compiler/tf2xla/kernels/split_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc @@ -124,7 +124,7 @@ class SplitVOp : public XlaOpKernel { input_shape.dims(), "), but got ", split_dim_orig)); - xla::ComputationDataHandle input = ctx->Input(0); + xla::XlaOp input = ctx->Input(0); OP_REQUIRES(ctx, input_shape.dims() > 0, errors::InvalidArgument("Can't split a 0 dimensional input")); diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index 1a78c7ab9be701..0fb05a2be7b103 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -38,13 +38,13 @@ limitations under the License. 
namespace tensorflow { namespace { -Status GetStackShape(xla::ComputationBuilder* builder, XlaResource* resource, +Status GetStackShape(xla::XlaBuilder* builder, XlaResource* resource, TensorShape* stack_shape) { auto shape_or_status = builder->GetShape(resource->value()); if (!shape_or_status.ok()) { return shape_or_status.status(); } - xla::Shape shape = *shape_or_status.ValueOrDie(); + xla::Shape shape = shape_or_status.ValueOrDie(); TF_RET_CHECK(xla::ShapeUtil::IsTuple(shape)); return XLAShapeToTensorShape(xla::ShapeUtil::GetTupleElementShape(shape, 0), stack_shape); @@ -60,9 +60,8 @@ Status GetStackShape(xla::ComputationBuilder* builder, XlaResource* resource, // // TODO(phawkins): consider changing the API of the stack operators to // allow an optional element shape at stack construction time. -Status MaybeInitializeStack(xla::ComputationBuilder* builder, - XlaResource* resource, DataType dtype, - const TensorShape& elem_shape) { +Status MaybeInitializeStack(xla::XlaBuilder* builder, XlaResource* resource, + DataType dtype, const TensorShape& elem_shape) { if (resource->type() != dtype) { return errors::InvalidArgument( "Stack dtype is ", DataTypeString(resource->type()), @@ -75,8 +74,6 @@ Status MaybeInitializeStack(xla::ComputationBuilder* builder, if (!resource->initialized()) { // Stack has not been initialized. - xla::ComputationDataHandle zero = - XlaHelpers::Zero(builder, resource->type()); TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape)); TF_RETURN_IF_ERROR(resource->SetZeroValue(builder)); } else { @@ -111,7 +108,7 @@ class StackOp : public XlaOpKernel { // We defer initializing the Stack resource until we see the first push. // Otherwise we do not know the shape of the stack elements. - xla::ComputationDataHandle value; + xla::XlaOp value; XlaContext& xc = XlaContext::Get(ctx); XlaResource* resource; string name = strings::StrCat("Stack: ", stack_name_); @@ -138,7 +135,7 @@ class StackPushOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); TensorShape elem_shape = ctx->InputShape(1); XlaResource* resource; @@ -147,9 +144,9 @@ class StackPushOp : public XlaOpKernel { // Initializes the Stack, if the element shape was not already known. OP_REQUIRES_OK(ctx, MaybeInitializeStack(b, resource, dtype_, elem_shape)); - xla::ComputationDataHandle ta = b->GetTupleElement(resource->value(), 0); - xla::ComputationDataHandle index = b->GetTupleElement(resource->value(), 1); - xla::ComputationDataHandle value = ctx->Input(1); + xla::XlaOp ta = b->GetTupleElement(resource->value(), 0); + xla::XlaOp index = b->GetTupleElement(resource->value(), 1); + xla::XlaOp value = ctx->Input(1); // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. 
auto start_indices = @@ -184,7 +181,7 @@ class StackPopOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); @@ -199,9 +196,9 @@ class StackPopOp : public XlaOpKernel { TensorShape stack_shape; OP_REQUIRES_OK(ctx, GetStackShape(b, resource, &stack_shape)); - xla::ComputationDataHandle state = resource->value(); - xla::ComputationDataHandle ta = b->GetTupleElement(state, 0); - xla::ComputationDataHandle index = b->GetTupleElement(state, 1); + xla::XlaOp state = resource->value(); + xla::XlaOp ta = b->GetTupleElement(state, 0); + xla::XlaOp index = b->GetTupleElement(state, 1); index = b->Sub(index, b->ConstantR0(1)); OP_REQUIRES_OK(ctx, resource->SetValue(b->Tuple({ta, index}))); @@ -216,8 +213,7 @@ class StackPopOp : public XlaOpKernel { // TODO(phawkins): We don't check the index is in bounds --- there is no // error mechanism in XLA. - xla::ComputationDataHandle read = - b->DynamicSlice(ta, start_indices, slice_shape); + xla::XlaOp read = b->DynamicSlice(ta, start_indices, slice_shape); // Remove the leading '1' dimension. std::vector value_shape(slice_shape.begin() + 1, slice_shape.end()); diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index 5bb773d97fc5ce..a99d4ddc7c4956 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -30,9 +30,8 @@ namespace tensorflow { namespace { // Rotates a 32-bit integer 'v' left by 'distance' bits. -xla::ComputationDataHandle RotateLeftS32(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& v, - int distance) { +xla::XlaOp RotateLeftS32(xla::XlaBuilder* builder, const xla::XlaOp& v, + int distance) { return builder->Or( builder->ShiftLeft(v, builder->ConstantR0(distance)), builder->ShiftRightLogical(v, builder->ConstantR0(32 - distance))); @@ -40,25 +39,24 @@ xla::ComputationDataHandle RotateLeftS32(xla::ComputationBuilder* builder, // TODO(b/65209188): add a primitive XOR to XLA and call it here, rather than // building XOR out of other bitwise operators. -xla::ComputationDataHandle BitwiseXor(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& x, - const xla::ComputationDataHandle& y) { +xla::XlaOp BitwiseXor(xla::XlaBuilder* builder, const xla::XlaOp& x, + const xla::XlaOp& y) { return builder->Or(builder->And(x, builder->Not(y)), builder->And(builder->Not(x), y)); } -using ThreeFry2x32State = std::array; +using ThreeFry2x32State = std::array; // Implements the ThreeFry counter-based PRNG algorithm. // Salmon et al. SC 2011. Parallel random numbers: as easy as 1, 2, 3. // http://www.thesalmons.org/john/random123/papers/random123sc11.pdf -ThreeFry2x32State ThreeFry2x32(xla::ComputationBuilder* builder, +ThreeFry2x32State ThreeFry2x32(xla::XlaBuilder* builder, ThreeFry2x32State input, ThreeFry2x32State key) { // Rotation distances specified by the Threefry2x32 algorithm. constexpr std::array rotations = {13, 15, 26, 6, 17, 29, 16, 24}; ThreeFry2x32State x; - std::array ks; + std::array ks; // 0x1BD11BDA is a parity constant specified by the ThreeFry2x32 algorithm. 
ks[2] = builder->ConstantR0(0x1BD11BDA); for (int i = 0; i < 2; ++i) { @@ -121,10 +119,9 @@ ThreeFry2x32State ThreeFry2x32(xla::ComputationBuilder* builder, // Returns a tensor of 'shape' random values uniformly distributed in the range // [minval, maxval) -xla::ComputationDataHandle RandomUniform(xla::ComputationBuilder* builder, - const xla::ComputationDataHandle& seed, - const TensorShape& shape, - double minval, double maxval) { +xla::XlaOp RandomUniform(xla::XlaBuilder* builder, const xla::XlaOp& seed, + const TensorShape& shape, double minval, + double maxval) { // Split the seed into two 32-bit scalars to form a key. auto seed0 = builder->Reshape(builder->Slice(seed, {0}, {1}, {1}), {}); auto seed1 = builder->Reshape(builder->Slice(seed, {1}, {2}, {1}), {}); @@ -178,9 +175,8 @@ xla::ComputationDataHandle RandomUniform(xla::ComputationBuilder* builder, // p = sum_{i=1}^n gq[i]*w^i // } // return p*x -xla::ComputationDataHandle ErfInvF32(xla::ComputationBuilder* b, - const xla::ComputationDataHandle& x, - const TensorShape& shape) { +xla::XlaOp ErfInvF32(xla::XlaBuilder* b, const xla::XlaOp& x, + const TensorShape& shape) { constexpr int kDegree = 9; constexpr std::array w_less_than_5_constants = { 2.81022636e-08f, 3.43273939e-07f, -3.5233877e-06f, @@ -220,7 +216,7 @@ class StatelessRandomUniformOp : public XlaOpKernel { : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); TensorShape shape; OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape)); @@ -229,7 +225,7 @@ class StatelessRandomUniformOp : public XlaOpKernel { OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2, errors::InvalidArgument("seed must have shape [2], not ", seed_shape.DebugString())); - xla::ComputationDataHandle seed = ctx->Input(1); + xla::XlaOp seed = ctx->Input(1); ctx->SetOutput(0, RandomUniform(builder, seed, shape, 0.0, 1.0)); } @@ -257,9 +253,10 @@ class StatelessRandomNormalOp : public XlaOpKernel { OP_REQUIRES(ctx, seed_shape == TensorShape({2}), errors::InvalidArgument("seed must have shape [2], not ", seed_shape.DebugString())); - xla::ComputationDataHandle seed = ctx->Input(1); - xla::ComputationBuilder* builder = ctx->builder(); - auto uniform = RandomUniform(builder, seed, shape, -1.0, 1.0); + xla::XlaOp seed = ctx->Input(1); + xla::XlaBuilder* builder = ctx->builder(); + auto uniform = + RandomUniform(builder, seed, shape, std::nextafter(-1.0f, 0.0f), 1.0); // Convert uniform distribution to normal distribution by computing // sqrt(2) * erfinv(x) auto normal = builder->Mul(builder->ConstantR0(std::sqrt(2.0)), diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index 6204aa4e27000f..55254c746e5eba 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -90,7 +90,7 @@ class StridedSliceOp : public XlaOpKernel { } } - xla::ComputationDataHandle slice = ctx->Input(0); + xla::XlaOp slice = ctx->Input(0); if (!dimensions_to_reverse.empty()) { slice = ctx->builder()->Rev(slice, dimensions_to_reverse); } @@ -168,7 +168,7 @@ class StridedSliceGradOp : public XlaOpKernel { auto zero = XlaHelpers::Zero(ctx->builder(), ctx->expected_output_dtype(0)); - xla::ComputationDataHandle grad = ctx->Input(4); + xla::XlaOp grad = ctx->Input(4); // Undo any new/shrink axes. 
grad = ctx->builder()->Reshape(grad, processing_shape.dim_sizes()); @@ -255,7 +255,7 @@ class StridedSliceAssignOp : public XlaOpKernel { &strides_tensor)); TensorShape lhs_shape; - xla::ComputationDataHandle lhs; + xla::XlaOp lhs; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &lhs_shape, &lhs)); const TensorShape rhs_shape = ctx->InputShape(4); @@ -284,7 +284,7 @@ class StridedSliceAssignOp : public XlaOpKernel { " does not match r-value shape ", rhs_shape.DebugString(), ". Automatic broadcasting not yet implemented.")); - xla::ComputationDataHandle rhs = ctx->Input(4); + xla::XlaOp rhs = ctx->Input(4); gtl::InlinedVector dimensions_to_reverse; gtl::InlinedVector slice_begin, slice_dims; diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 000b50af6bd86b..9adee78a1fd1fb 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -47,7 +47,7 @@ namespace { // the TensorArray with elements of `elem_shape`. For both initialized and // uninitialized TensorArrays, checks that the tensor has a type compatible with // 'dtype' and shape compatible with 'elem_shape'. -Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, +Status MaybeInitializeTensorArray(xla::XlaBuilder* builder, XlaResource* resource, DataType dtype, const TensorShape& elem_shape) { if (resource->kind() != XlaResource::kTensorArray) { @@ -64,9 +64,6 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, << resource->name() << " size " << resource->tensor_array_size(); if (!resource->initialized()) { - xla::ComputationDataHandle zero = - XlaHelpers::Zero(builder, resource->type()); - TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape)); TF_RETURN_IF_ERROR(resource->SetZeroValue(builder)); } else { @@ -77,7 +74,7 @@ Status MaybeInitializeTensorArray(xla::ComputationBuilder* builder, } TensorShape shape; TF_RETURN_IF_ERROR( - XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape)); + XLAShapeToTensorShape(shape_or_status.ValueOrDie(), &shape)); TensorShape ta_shape; ta_shape.AddDim(resource->tensor_array_size()); @@ -114,23 +111,21 @@ Status CheckTensorArrayIsInitialized(const string& op_name, } Status GetTensorArrayShape(const XlaResource* resource, - xla::ComputationBuilder* builder, - TensorShape* shape) { + xla::XlaBuilder* builder, TensorShape* shape) { *shape = resource->shape(); shape->InsertDim(0, resource->tensor_array_size()); return Status::OK(); } -// Like ComputationBuilder::DynamicUpdateSlice, but adds 'update' to the +// Like XlaBuilder::DynamicUpdateSlice, but adds 'update' to the // relevant slice of 'operand'. 
-xla::ComputationDataHandle DynamicAddSlice( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& operand, - const xla::ComputationDataHandle& update, - const gtl::ArraySlice& update_dims, - const xla::ComputationDataHandle& start_indices) { - xla::ComputationDataHandle current = +xla::XlaOp DynamicAddSlice(xla::XlaBuilder* builder, const xla::XlaOp& operand, + const xla::XlaOp& update, + const gtl::ArraySlice& update_dims, + const xla::XlaOp& start_indices) { + xla::XlaOp current = builder->DynamicSlice(operand, start_indices, update_dims); - xla::ComputationDataHandle sum = builder->Add(current, update); + xla::XlaOp sum = builder->Add(current, update); return builder->DynamicUpdateSlice(operand, sum, start_indices); } @@ -155,18 +150,18 @@ class TensorArrayOp : public XlaOpKernel { OP_REQUIRES(ctx, size >= 0, errors::InvalidArgument("TensorArray size must be >= 0")); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); // Initializes the TensorArray value if we know the element shape. // Otherwise, defer initialization to the first write. - xla::ComputationDataHandle value; + xla::XlaOp value; TensorShape shape; if (element_shape_.IsFullyDefined()) { CHECK(element_shape_.AsTensorShape(&shape)); TensorShape ta_shape; ta_shape.AddDim(size); ta_shape.AppendShape(shape); - xla::ComputationDataHandle zero = XlaHelpers::Zero(b, dtype_); + xla::XlaOp zero = XlaHelpers::Zero(b, dtype_); value = b->Broadcast(zero, ta_shape.dim_sizes()); } @@ -202,7 +197,7 @@ class TensorArrayWriteOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); TensorShape elem_shape = ctx->InputShape(2); @@ -213,10 +208,10 @@ class TensorArrayWriteOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, resource, dtype_, elem_shape)); - xla::ComputationDataHandle ta = resource->value(); - xla::ComputationDataHandle index = ctx->Input(1); - xla::ComputationDataHandle value = ctx->Input(2); - xla::ComputationDataHandle flow = ctx->Input(3); + xla::XlaOp ta = resource->value(); + xla::XlaOp index = ctx->Input(1); + xla::XlaOp value = ctx->Input(2); + xla::XlaOp flow = ctx->Input(3); // start_indices of the DynamicUpdateSlice are [index, 0, 0, ..., 0]. auto start_indices = @@ -227,7 +222,7 @@ class TensorArrayWriteOp : public XlaOpKernel { slice_shape.InsertDim(0, 1LL); auto update = b->Reshape(value, slice_shape.dim_sizes()); - xla::ComputationDataHandle written = + xla::XlaOp written = DynamicAddSlice(b, ta, update, slice_shape.dim_sizes(), start_indices); OP_REQUIRES_OK(ctx, resource->SetValue(written)); @@ -249,7 +244,7 @@ class TensorArrayReadOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); @@ -259,8 +254,8 @@ class TensorArrayReadOp : public XlaOpKernel { TensorShape ta_shape; OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape)); - xla::ComputationDataHandle ta = resource->value(); - xla::ComputationDataHandle index = ctx->Input(1); + xla::XlaOp ta = resource->value(); + xla::XlaOp index = ctx->Input(1); // start_indices of the DynamicSlice are [index, 0, 0, ..., 0]. 
auto start_indices = @@ -270,8 +265,7 @@ class TensorArrayReadOp : public XlaOpKernel { auto slice_shape = ta_shape.dim_sizes(); slice_shape[0] = 1LL; - xla::ComputationDataHandle read = - b->DynamicSlice(ta, start_indices, slice_shape); + xla::XlaOp read = b->DynamicSlice(ta, start_indices, slice_shape); // Remove the leading '1' dimension. std::vector value_shape(slice_shape.begin() + 1, slice_shape.end()); @@ -293,7 +287,7 @@ class TensorArrayGatherOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); @@ -309,7 +303,7 @@ class TensorArrayGatherOp : public XlaOpKernel { auto indices = ctx->Input(1); DataType index_type = ctx->input_type(1); - xla::ComputationDataHandle ta = resource->value(); + xla::XlaOp ta = resource->value(); // Look for the case where the gather takes a simple slice from the // tensor array (0, 1, 2, 3, 4, ..., N) @@ -337,7 +331,7 @@ class TensorArrayGatherOp : public XlaOpKernel { } } - xla::ComputationDataHandle gather; + xla::XlaOp gather; OP_REQUIRES_OK( ctx, XlaGather(ta, ta_shape, indices, indices_shape, /*axis=*/0, @@ -360,7 +354,7 @@ class TensorArrayScatterOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); const TensorShape value_shape = ctx->InputShape(2); @@ -375,11 +369,11 @@ class TensorArrayScatterOp : public XlaOpKernel { OP_REQUIRES(ctx, indices_shape.dims() >= 1, errors::InvalidArgument("indices must be rank 1")); const int num_indices = indices_shape.dim_size(0); - const xla::ComputationDataHandle indices = ctx->Input(1); + const xla::XlaOp indices = ctx->Input(1); - xla::ComputationDataHandle ta = resource->value(); - const xla::ComputationDataHandle value = ctx->Input(2); - const xla::ComputationDataHandle flow = ctx->Input(3); + xla::XlaOp ta = resource->value(); + const xla::XlaOp value = ctx->Input(2); + const xla::XlaOp flow = ctx->Input(3); // Look for the case where the scatter is for each sub-tensor in order. The // tensor array implementation allows for this to be a straight addition. 
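The fast path hinted at above applies when the scatter indices are exactly 0, 1, ..., n-1: scattering n leading sub-tensors in order reduces to a straight addition of the update block, with no per-index dynamic updates. A small sketch of the index check in plain C++ (illustrative only; the helper name is hypothetical):

```c++
#include <cstddef>
#include <vector>

// True if 'indices' is 0, 1, ..., n-1, i.e. the scatter writes each
// sub-tensor to its own position in order.
bool IndicesAreInOrder(const std::vector<int>& indices) {
  for (std::size_t i = 0; i < indices.size(); ++i) {
    if (indices[i] != static_cast<int>(i)) return false;
  }
  return true;
}
```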
@@ -443,7 +437,7 @@ class TensorArrayConcatOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); @@ -453,7 +447,7 @@ class TensorArrayConcatOp : public XlaOpKernel { TensorShape ta_shape; OP_REQUIRES_OK(ctx, GetTensorArrayShape(resource, b, &ta_shape)); - xla::ComputationDataHandle ta = resource->value(); + xla::XlaOp ta = resource->value(); auto ta_dims = ta_shape.dim_sizes(); std::vector shape(ta_dims.begin() + 1, ta_dims.end()); @@ -503,12 +497,12 @@ class TensorArraySplitOp : public XlaOpKernel { TensorShape elem_shape = value_shape; elem_shape.set_dim(0, length); - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); OP_REQUIRES_OK(ctx, MaybeInitializeTensorArray(b, resource, dtype_, elem_shape)); - xla::ComputationDataHandle ta = resource->value(); + xla::XlaOp ta = resource->value(); TensorShape ta_shape; ta_shape.AddDim(resource->tensor_array_size()); @@ -520,8 +514,8 @@ class TensorArraySplitOp : public XlaOpKernel { "TensorArray's size is not equal to the size of lengths (", lengths.size(), " vs. ", resource->tensor_array_size(), ")")); - const xla::ComputationDataHandle value = ctx->Input(1); - const xla::ComputationDataHandle flow = ctx->Input(3); + const xla::XlaOp value = ctx->Input(1); + const xla::XlaOp flow = ctx->Input(3); OP_REQUIRES(ctx, value_shape.num_elements() == ta_shape.num_elements(), errors::InvalidArgument("mismatched element count ", @@ -569,7 +563,7 @@ class TensorArrayGradOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &resource)); diff --git a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc index 9aefcd4fc7f94a..e91075196bd841 100644 --- a/tensorflow/compiler/tf2xla/kernels/tile_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tile_ops.cc @@ -112,7 +112,7 @@ class TileOp : public XlaOpKernel { flattened.push_back(i); flattened.push_back(i + output_shape.size()); } - xla::ComputationDataHandle output = + xla::XlaOp output = ctx->builder()->Reshape(broadcasted, flattened, output_shape); ctx->SetOutput(0, output); diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc index f750f7003be288..34caefa050c0d5 100644 --- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc @@ -16,7 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" @@ -30,8 +30,8 @@ class ResourceApplyGradientDescent : public XlaOpKernel { explicit ResourceApplyGradientDescent(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle handle; - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaOp handle; + xla::XlaBuilder* b = ctx->builder(); DataType type = ctx->input_type(1); TensorShape var_shape; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &handle)); @@ -63,12 +63,12 @@ class ResourceApplyMomentum : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); DataType type = ctx->input_type(2); TensorShape var_shape, accum_shape; - xla::ComputationDataHandle var, accum; + xla::XlaOp var, accum; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum)); @@ -93,9 +93,9 @@ class ResourceApplyMomentum : public XlaOpKernel { errors::InvalidArgument("momentum is not a scalar: ", momentum_shape.DebugString())); - xla::ComputationDataHandle lr = ctx->Input(2); - xla::ComputationDataHandle grad = ctx->Input(3); - xla::ComputationDataHandle momentum = ctx->Input(4); + xla::XlaOp lr = ctx->Input(2); + xla::XlaOp grad = ctx->Input(3); + xla::XlaOp momentum = ctx->Input(4); accum = b->Add(b->Mul(accum, momentum), grad); if (use_nesterov_) { @@ -121,12 +121,12 @@ class ResourceApplyAdagrad : public XlaOpKernel { explicit ResourceApplyAdagrad(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); DataType type = ctx->input_type(2); TensorShape var_shape, accum_shape; - xla::ComputationDataHandle var, accum; + xla::XlaOp var, accum; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &accum_shape, &accum)); @@ -146,8 +146,8 @@ class ResourceApplyAdagrad : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle lr = ctx->Input(2); - xla::ComputationDataHandle grad = ctx->Input(3); + xla::XlaOp lr = ctx->Input(2); + xla::XlaOp grad = ctx->Input(3); accum = b->Add(accum, b->Pow(grad, XlaHelpers::FloatLiteral(b, type, 2.0))); var = b->Sub( @@ -168,7 +168,7 @@ class ResourceApplyAdam : public XlaOpKernel { void Compile(XlaOpKernelContext* ctx) override { TensorShape var_shape, m_shape, v_shape; - xla::ComputationDataHandle var, m, v; + xla::XlaOp var, m, v; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype_, &var_shape, &var)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype_, &m_shape, &m)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype_, &v_shape, &v)); @@ -213,25 +213,25 @@ class ResourceApplyAdam : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - 
xla::ComputationDataHandle beta1_power = ctx->Input(3); - xla::ComputationDataHandle beta2_power = ctx->Input(4); - xla::ComputationDataHandle lr = ctx->Input(5); - xla::ComputationDataHandle beta1 = ctx->Input(6); - xla::ComputationDataHandle beta2 = ctx->Input(7); - xla::ComputationDataHandle epsilon = ctx->Input(8); - xla::ComputationDataHandle grad = ctx->Input(9); + xla::XlaOp beta1_power = ctx->Input(3); + xla::XlaOp beta2_power = ctx->Input(4); + xla::XlaOp lr = ctx->Input(5); + xla::XlaOp beta1 = ctx->Input(6); + xla::XlaOp beta2 = ctx->Input(7); + xla::XlaOp epsilon = ctx->Input(8); + xla::XlaOp grad = ctx->Input(9); // alpha <- learning_rate * sqrt(1 - beta2^t) / (1 - beta1^t) // m_t <- beta1 * m_{t-1} + (1 - beta1) * g_t // v_t <- beta2 * v_{t-1} + (1 - beta2) * g_t * g_t // variable <- variable - alpha * m_t / (sqrt(v_t) + epsilon) - xla::ComputationBuilder* b = ctx->builder(); - xla::ComputationDataHandle half = XlaHelpers::FloatLiteral(b, dtype_, 0.5); - xla::ComputationDataHandle one = XlaHelpers::FloatLiteral(b, dtype_, 1.0); - xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype_, 2.0); + xla::XlaBuilder* b = ctx->builder(); + xla::XlaOp half = XlaHelpers::FloatLiteral(b, dtype_, 0.5); + xla::XlaOp one = XlaHelpers::FloatLiteral(b, dtype_, 1.0); + xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype_, 2.0); - xla::ComputationDataHandle alpha = + xla::XlaOp alpha = b->Div(b->Mul(lr, b->Pow(b->Sub(one, beta2_power), half)), b->Sub(one, beta1_power)); m = b->Add(m, b->Mul(b->Sub(grad, m), b->Sub(one, beta1))); @@ -255,12 +255,12 @@ class ResourceApplyRMSProp : public XlaOpKernel { explicit ResourceApplyRMSProp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); DataType type = ctx->input_type(3); TensorShape var_shape, ms_shape, mom_shape; - xla::ComputationDataHandle var, ms, mom; + xla::XlaOp var, ms, mom; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &var_shape, &var)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, type, &ms_shape, &ms)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, type, &mom_shape, &mom)); @@ -297,11 +297,11 @@ class ResourceApplyRMSProp : public XlaOpKernel { "var and grad do not have the same shape", var_shape.DebugString(), " ", grad_shape.DebugString())); - xla::ComputationDataHandle lr = ctx->Input(3); - xla::ComputationDataHandle rho = ctx->Input(4); - xla::ComputationDataHandle momentum = ctx->Input(5); - xla::ComputationDataHandle epsilon = ctx->Input(6); - xla::ComputationDataHandle grad = ctx->Input(7); + xla::XlaOp lr = ctx->Input(3); + xla::XlaOp rho = ctx->Input(4); + xla::XlaOp momentum = ctx->Input(5); + xla::XlaOp epsilon = ctx->Input(6); + xla::XlaOp grad = ctx->Input(7); // ms <- rho * ms_{t-1} + (1-rho) * grad * grad // mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) @@ -320,16 +320,16 @@ class ResourceApplyRMSProp : public XlaOpKernel { // ms <- grad**2 (1 - rho) + ms * rho // // Which is the equation listed above. 
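The accumulator rewrite referenced above follows from a one-line identity: rho * ms + (1 - rho) * grad^2 = ms + (grad^2 - ms) * (1 - rho), which is exactly what the Add/Sub/Mul chain below computes. A standalone scalar sketch of one RMSProp step in that form, outside the patch (plain floats, illustrative only):

```c++
#include <cmath>

// One scalar RMSProp step using the rewritten accumulator update
// ms += (grad^2 - ms) * (1 - rho), equivalent to rho*ms + (1-rho)*grad^2.
void RmsPropStep(float grad, float lr, float rho, float momentum,
                 float epsilon, float* var, float* ms, float* mom) {
  *ms += (grad * grad - *ms) * (1.0f - rho);
  *mom = momentum * *mom + lr * grad / std::sqrt(*ms + epsilon);
  *var -= *mom;
}
```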
- xla::ComputationDataHandle new_ms = b->Add( + xla::XlaOp new_ms = b->Add( ms, b->Mul(b->Sub(b->Pow(grad, XlaHelpers::FloatLiteral(b, type, 2.0)), ms), b->Sub(XlaHelpers::FloatLiteral(b, type, 1.0), rho))); - xla::ComputationDataHandle new_mom = + xla::XlaOp new_mom = b->Add(b->Mul(mom, momentum), b->Mul(b->Mul(grad, lr), b->Pow(b->Add(new_ms, epsilon), XlaHelpers::FloatLiteral(b, type, -0.5)))); - xla::ComputationDataHandle new_var = b->Sub(var, new_mom); + xla::XlaOp new_var = b->Sub(var, new_mom); OP_REQUIRES_OK(ctx, ctx->AssignVariable(0, type, new_var)); OP_REQUIRES_OK(ctx, ctx->AssignVariable(1, type, new_ms)); @@ -341,10 +341,10 @@ REGISTER_XLA_OP(Name("ResourceApplyRMSProp").TypeConstraint("T", kFloatTypes), void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, bool has_l2_shrinkage) { - xla::ComputationBuilder* b = ctx->builder(); + xla::XlaBuilder* b = ctx->builder(); TensorShape var_shape, accum_shape, linear_shape; - xla::ComputationDataHandle var, accum, linear; + xla::XlaOp var, accum, linear; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, dtype, &var_shape, &var)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(1, dtype, &accum_shape, &accum)); OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(2, dtype, &linear_shape, &linear)); @@ -399,12 +399,12 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, errors::InvalidArgument("lr_power is not a scalar: ", lr_power_shape.DebugString())); - xla::ComputationDataHandle grad = ctx->Input(3); - xla::ComputationDataHandle lr = ctx->Input(4); - xla::ComputationDataHandle l1 = ctx->Input(5); - xla::ComputationDataHandle l2 = ctx->Input(6); - xla::ComputationDataHandle l2_shrinkage; - xla::ComputationDataHandle lr_power; + xla::XlaOp grad = ctx->Input(3); + xla::XlaOp lr = ctx->Input(4); + xla::XlaOp l1 = ctx->Input(5); + xla::XlaOp l2 = ctx->Input(6); + xla::XlaOp l2_shrinkage; + xla::XlaOp lr_power; if (has_l2_shrinkage) { l2_shrinkage = ctx->Input(7); lr_power = ctx->Input(8); @@ -421,26 +421,23 @@ void CompileFtrl(XlaOpKernelContext* ctx, DataType dtype, // var = (linear_clipped - linear) / quadratic // accum = new_accum - xla::ComputationDataHandle two = XlaHelpers::FloatLiteral(b, dtype, 2.0); - xla::ComputationDataHandle grad_to_use; + xla::XlaOp two = XlaHelpers::FloatLiteral(b, dtype, 2.0); + xla::XlaOp grad_to_use; if (has_l2_shrinkage) { grad_to_use = b->Add(grad, b->Mul(two, b->Mul(l2_shrinkage, var))); } else { grad_to_use = grad; } - xla::ComputationDataHandle new_accum = - b->Add(accum, b->Pow(grad_to_use, two)); - xla::ComputationDataHandle new_accum_lr_pow = - b->Pow(new_accum, b->Neg(lr_power)); - xla::ComputationDataHandle accum_lr_pow = b->Pow(accum, b->Neg(lr_power)); + xla::XlaOp new_accum = b->Add(accum, b->Pow(grad_to_use, two)); + xla::XlaOp new_accum_lr_pow = b->Pow(new_accum, b->Neg(lr_power)); + xla::XlaOp accum_lr_pow = b->Pow(accum, b->Neg(lr_power)); linear = b->Add( linear, b->Sub(grad_to_use, b->Mul(b->Div(b->Sub(new_accum_lr_pow, accum_lr_pow), lr), var))); - xla::ComputationDataHandle linear_clipped = b->Clamp(b->Neg(l1), linear, l1); - xla::ComputationDataHandle quadratic = - b->Add(b->Div(new_accum_lr_pow, lr), b->Mul(two, l2)); + xla::XlaOp linear_clipped = b->Clamp(b->Neg(l1), linear, l1); + xla::XlaOp quadratic = b->Add(b->Div(new_accum_lr_pow, lr), b->Mul(two, l2)); var = b->Div(b->Sub(linear_clipped, linear), quadratic); accum = new_accum; diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc index 7cb47f908d4ff4..71a9fd051bfc8d 100644 --- 
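The FTRL path in `CompileFtrl` is easier to follow next to a scalar reference. The sketch below mirrors the per-element formulas from the comment and ops above, without the optional `l2_shrinkage` term; it is a host-side illustration only.

```c++
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Illustrative FTRL-Proximal step:
//   new_accum = accum + grad^2
//   linear   += grad - (new_accum^(-lr_power) - accum^(-lr_power)) / lr * var
//   var       = (clamp(linear, -l1, l1) - linear)
//               / (new_accum^(-lr_power) / lr + 2 * l2)
//   accum     = new_accum
void FtrlStep(std::vector<float>& var, std::vector<float>& accum,
              std::vector<float>& linear, const std::vector<float>& grad,
              float lr, float l1, float l2, float lr_power) {
  for (std::size_t i = 0; i < var.size(); ++i) {
    const float g = grad[i];
    const float new_accum = accum[i] + g * g;
    const float new_accum_lr_pow = std::pow(new_accum, -lr_power);
    const float accum_lr_pow = std::pow(accum[i], -lr_power);
    linear[i] += g - (new_accum_lr_pow - accum_lr_pow) / lr * var[i];
    const float linear_clipped = std::min(std::max(linear[i], -l1), l1);
    const float quadratic = new_accum_lr_pow / lr + 2.0f * l2;
    var[i] = (linear_clipped - linear[i]) / quadratic;
    accum[i] = new_accum;
  }
}
```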
a/tensorflow/compiler/tf2xla/kernels/unary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops.cc @@ -19,7 +19,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/kernel_def_builder.h" namespace tensorflow { @@ -33,9 +33,9 @@ namespace { public: \ explicit NAME##Op(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} \ void Compile(XlaOpKernelContext* ctx) { \ - xla::ComputationBuilder* b = ctx->builder(); \ - xla::ComputationDataHandle x = ctx->Input(0); \ - xla::ComputationDataHandle y = COMPUTATION; \ + xla::XlaBuilder* b = ctx->builder(); \ + xla::XlaOp x = ctx->Input(0); \ + xla::XlaOp y = COMPUTATION; \ ctx->SetOutput(0, y); \ } \ }; \ @@ -100,8 +100,7 @@ XLAJIT_MAKE_UNARY(Cosh, XLAJIT_MAKE_UNARY(Sin, b->Sin(x)); XLAJIT_MAKE_UNARY(Exp, b->Exp(x)); -// TODO(b/34703906): use a more accurate implementation of expm1. -XLAJIT_MAKE_UNARY(Expm1, b->Sub(b->Exp(x), XlaHelpers::One(b, input_type(0)))); +XLAJIT_MAKE_UNARY(Expm1, b->Expm1(x)); XLAJIT_MAKE_UNARY(Floor, b->Floor(x)); XLAJIT_MAKE_UNARY(IsFinite, b->IsFinite(x)); @@ -115,8 +114,7 @@ XLAJIT_MAKE_UNARY(Inv, b->Div(XlaHelpers::One(b, input_type(0)), x)); XLAJIT_MAKE_UNARY(Reciprocal, b->Div(XlaHelpers::One(b, input_type(0)), x)); XLAJIT_MAKE_UNARY(Log, b->Log(x)); -// TODO(b/34703906): use a more accurate implementation of log1p. -XLAJIT_MAKE_UNARY(Log1p, b->Log(b->Add(XlaHelpers::One(b, input_type(0)), x))); +XLAJIT_MAKE_UNARY(Log1p, b->Log1p(x)); XLAJIT_MAKE_UNARY(Invert, b->Not(x)); XLAJIT_MAKE_UNARY(LogicalNot, b->Not(x)); @@ -124,9 +122,8 @@ XLAJIT_MAKE_UNARY(Neg, b->Neg(x)); // Implements Banker's rounding: numbers that are equidistant between two // integers are rounded towards even. -static xla::ComputationDataHandle Round(xla::ComputationBuilder* b, - DataType dtype, - const xla::ComputationDataHandle& x) { +static xla::XlaOp Round(xla::XlaBuilder* b, DataType dtype, + const xla::XlaOp& x) { auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5); auto one = XlaHelpers::FloatLiteral(b, dtype, 1.0); auto two = XlaHelpers::FloatLiteral(b, dtype, 2.0); @@ -148,9 +145,8 @@ XLAJIT_MAKE_UNARY(Rsqrt, b->Pow(x, XlaHelpers::FloatLiteral(b, input_type(0), -0.5))); // Expresses sigmoid as a rescaled tanh: sigmoid(x) == (tanh(x/2) + 1) / 2. -static xla::ComputationDataHandle Sigmoid(xla::ComputationBuilder* b, - DataType dtype, - const xla::ComputationDataHandle& x) { +static xla::XlaOp Sigmoid(xla::XlaBuilder* b, DataType dtype, + const xla::XlaOp& x) { auto half = XlaHelpers::FloatLiteral(b, dtype, 0.5); return b->Add(half, b->Mul(half, b->Tanh(b->Mul(half, x)))); } @@ -162,26 +158,17 @@ XLAJIT_MAKE_UNARY(Sinh, b->Mul(b->Sub(b->Exp(x), b->Exp(b->Neg(x))), XlaHelpers::FloatLiteral(b, input_type(0), 0.5))); -static xla::ComputationDataHandle Softplus( - xla::ComputationBuilder* b, DataType dtype, - const xla::ComputationDataHandle& features) { - xla::ComputationDataHandle threshold = - b->Add(b->Log(XlaHelpers::Epsilon(b, dtype)), - XlaHelpers::FloatLiteral(b, dtype, 2.0)); - // Value above which exp(x) may overflow, but softplus(x) == x - // is within machine epsilon. 
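On the `Round` helper above: its comment says it implements banker's rounding, i.e. values exactly halfway between two integers go to the even integer. The host's round-to-nearest-even gives the same behaviour and can serve as a reference point; the snippet below is for comparison only and is unrelated to the XLA graph the kernel builds.

```c++
#include <cmath>

// Round-half-to-even reference: under the default FE_TONEAREST rounding
// mode, std::nearbyint sends 0.5 -> 0, 1.5 -> 2, 2.5 -> 2, i.e. ties are
// rounded towards the even integer, matching the Round() kernel's intent.
float RoundHalfToEven(float x) { return std::nearbyint(x); }
```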
- xla::ComputationDataHandle too_large = b->Gt(features, b->Neg(threshold)); - // Value below which exp(x) may underflow, but softplus(x) == exp(x) - // is within machine epsilon. - xla::ComputationDataHandle too_small = b->Lt(features, threshold); - xla::ComputationDataHandle features_exp = b->Exp(features); - xla::ComputationDataHandle output = b->Select( - too_large, features, - b->Select(too_small, features_exp, - b->Log(b->Add(features_exp, XlaHelpers::One(b, dtype))))); - return output; -} -XLAJIT_MAKE_UNARY(Softplus, Softplus(b, input_type(0), x)); +// softplus(x) = log(1 + exp(x)) +// +// This is not numerically stable when x is large, it can easily overflow. +// However, we can compute it as LogSumExp(x, 0): +// max(x, 0) + log(exp(x - max(x, 0)) + exp(0 - max(x, 0))) +// +// This is equivalent to: +// max(x, 0) + log1p(exp(-abs(x))) +XLAJIT_MAKE_UNARY(Softplus, + b->Add(b->Max(x, XlaHelpers::Zero(b, input_type(0))), + b->Log1p(b->Exp(b->Neg(b->Abs(x)))))); // softsign(x) = x / (abs(x) + 1) XLAJIT_MAKE_UNARY(Softsign, diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index 71173f5aead477..a163fa0a5b3467 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -19,7 +19,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/kernel_def_builder.h" #include "tensorflow/core/framework/types.h" @@ -48,7 +48,7 @@ class ReadVariableOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationDataHandle handle; + xla::XlaOp handle; OP_REQUIRES_OK( ctx, ctx->ReadVariableInput(0, dtype_, /*shape=*/nullptr, &handle)); ctx->SetOutput(0, handle); @@ -57,7 +57,7 @@ class ReadVariableOp : public XlaOpKernel { private: DataType dtype_; }; -REGISTER_XLA_OP(Name("ReadVariableOp"), ReadVariableOp); +REGISTER_XLA_OP(Name("ReadVariableOp").CompilationOnly(), ReadVariableOp); class AssignVariableOp : public XlaOpKernel { public: @@ -67,14 +67,14 @@ class AssignVariableOp : public XlaOpKernel { ctx->AssignVariable(0, ctx->input_type(1), ctx->Input(1))); } }; -REGISTER_XLA_OP(Name("AssignVariableOp"), AssignVariableOp); +REGISTER_XLA_OP(Name("AssignVariableOp").CompilationOnly(), AssignVariableOp); class AssignAddVariableOp : public XlaOpKernel { public: explicit AssignAddVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { DataType type = ctx->input_type(1); - xla::ComputationDataHandle handle; + xla::XlaOp handle; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); handle = ctx->builder()->Add(handle, ctx->Input(1)); @@ -90,7 +90,7 @@ class AssignSubVariableOp : public XlaOpKernel { explicit AssignSubVariableOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { DataType type = ctx->input_type(1); - xla::ComputationDataHandle handle; + xla::XlaOp handle; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, /*shape=*/nullptr, &handle)); handle = ctx->builder()->Sub(handle, ctx->Input(1)); @@ -105,19 +105,19 @@ class ResourceGatherOp : public XlaOpKernel { 
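The new Softplus expression is worth spelling out: `log(1 + exp(x))` overflows as soon as `exp(x)` does, while the rewritten `LogSumExp(x, 0) = max(x, 0) + log1p(exp(-|x|))` keeps every intermediate bounded. A small host-side check of the identity (using standard `<cmath>` functions, not the XLA ops):

```c++
#include <algorithm>
#include <cmath>
#include <cstdio>

// Stable softplus matching the identity used by the new kernel.
float SoftplusStable(float x) {
  return std::max(x, 0.0f) + std::log1p(std::exp(-std::fabs(x)));
}

int main() {
  std::printf("naive : %f\n", std::log1p(std::exp(100.0f)));  // inf: exp overflows
  std::printf("stable: %f\n", SoftplusStable(100.0f));        // ~100.0
  return 0;
}
```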
public: explicit ResourceGatherOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} void Compile(XlaOpKernelContext* ctx) override { - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); DataType type = ctx->expected_output_dtype(0); TensorShape resource_shape; - xla::ComputationDataHandle resource_handle; + xla::XlaOp resource_handle; OP_REQUIRES_OK(ctx, ctx->ReadVariableInput(0, type, &resource_shape, &resource_handle)); auto indices = ctx->Input(1); auto indices_shape = ctx->InputShape(1); DataType index_type = ctx->input_type(1); - xla::ComputationDataHandle gather; + xla::XlaOp gather; OP_REQUIRES_OK( ctx, XlaGather(resource_handle, resource_shape, indices, indices_shape, /*axis=*/0, /*indices_are_nd=*/false, type, index_type, diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 0ff1b65ae9179d..5467c5d9946846 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -21,7 +21,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/op_kernel.h" @@ -101,7 +101,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { ctx, MakeXlaCompilerArgumentsFromInputs( ctx, &arguments, &has_uninitialized_vars, &has_tensor_arrays)); - xla::ComputationBuilder* builder = ctx->builder(); + xla::XlaBuilder* builder = ctx->builder(); XlaCompiler* compiler = ctx->compiler(); VLOG(1) << "Compiling body"; @@ -234,7 +234,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { xla::ShapeUtil::HumanString(cond.xla_output_shape))); int num_inputs = body.input_mapping.size(); - std::vector inputs(num_inputs); + std::vector inputs(num_inputs); for (int i = 0; i < num_inputs; ++i) { int input_num = body.input_mapping[i]; if (ctx->input_type(input_num) == DT_RESOURCE) { @@ -246,24 +246,24 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { } } - xla::ComputationDataHandle init = builder->Tuple(inputs); + xla::XlaOp init = builder->Tuple(inputs); VLOG(1) << "Building while loop"; // Wraps the condition in a computation that unpacks the output tuple. - xla::Computation cond_wrapper; + xla::XlaComputation cond_wrapper; { - std::unique_ptr cb = + std::unique_ptr cb = builder->CreateSubBuilder("cond_wrapper"); auto inputs = cb->Parameter(0, cond_input_shape, "inputs"); auto outputs = cb->Call(*cond.computation, {inputs}); cb->GetTupleElement(outputs, 0); - xla::StatusOr result = cb->Build(); + xla::StatusOr result = cb->Build(); OP_REQUIRES_OK(ctx, result.status()); cond_wrapper = std::move(result.ValueOrDie()); } - xla::ComputationDataHandle while_result = + xla::XlaOp while_result = builder->While(cond_wrapper, *body.computation, init); // Sets non-variable outputs. 
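For readers new to `XlaWhileOp`: the emitted loop carries all state in a single tuple, the condition computation reads that tuple and produces a predicate (hence the `cond_wrapper` sub-builder that unpacks the wrapped condition's output), and the body returns the next tuple. A host-side analogue of that structure, purely illustrative:

```c++
#include <functional>
#include <utility>

// Shape of the loop XlaWhileOp builds: one tuple of carried state, a
// condition mapping state -> bool, and a body mapping state -> next state.
template <typename State>
State WhileLoop(const std::function<bool(const State&)>& cond,
                const std::function<State(const State&)>& body, State init) {
  State state = std::move(init);
  while (cond(state)) {
    state = body(state);
  }
  return state;
}
```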
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD index fde1977c1b1834..ee7f5d510ab7a3 100644 --- a/tensorflow/compiler/tf2xla/lib/BUILD +++ b/tensorflow/compiler/tf2xla/lib/BUILD @@ -25,8 +25,8 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", ], ) @@ -44,8 +44,8 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", ], ) @@ -62,9 +62,9 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", ], ) @@ -80,10 +80,9 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", ], ) @@ -91,6 +90,7 @@ cc_library( xla_test( name = "triangular_solve_test", srcs = ["triangular_solve_test.cc"], + tags = ["noasan"], # sometimes times out, http://b/78650012 deps = [ ":triangular_solve", "//tensorflow/compiler/xla:array2d", @@ -100,9 +100,9 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -121,8 +121,8 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", ], ) @@ -140,7 +140,6 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/tests:client_library_test_base", @@ -160,8 +159,8 @@ 
cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc index 798f0fa78055e8..526694d5a0c712 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.cc @@ -25,24 +25,22 @@ limitations under the License. namespace tensorflow { -xla::StatusOr BatchDot( - xla::ComputationBuilder* builder, xla::ComputationDataHandle x, - xla::ComputationDataHandle y, bool transpose_x, bool transpose_y, - bool conjugate_x, bool conjugate_y) { - TF_ASSIGN_OR_RETURN(std::unique_ptr x_shape, - builder->GetShape(x)); - TF_ASSIGN_OR_RETURN(std::unique_ptr y_shape, - builder->GetShape(y)); +xla::StatusOr BatchDot(xla::XlaBuilder* builder, xla::XlaOp x, + xla::XlaOp y, bool transpose_x, + bool transpose_y, bool conjugate_x, + bool conjugate_y) { + TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x)); + TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y)); // Check that both tensors have the same number of dimensions. There must be // at least two (the batch dimensions can be empty). - if (xla::ShapeUtil::Rank(*x_shape) != xla::ShapeUtil::Rank(*y_shape)) { + if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) { return errors::InvalidArgument( "Arguments to BatchedDot have different ranks: ", - xla::ShapeUtil::HumanString(*x_shape), " vs. ", - xla::ShapeUtil::HumanString(*y_shape)); + xla::ShapeUtil::HumanString(x_shape), " vs. ", + xla::ShapeUtil::HumanString(y_shape)); } - const int ndims = xla::ShapeUtil::Rank(*x_shape); + const int ndims = xla::ShapeUtil::Rank(x_shape); if (ndims < 2) { return errors::InvalidArgument( "Arguments to BatchedDot must have rank >= 2: ", ndims); @@ -52,46 +50,46 @@ xla::StatusOr BatchDot( // valid. std::vector batch_dimension_numbers; for (int i = 0; i < ndims - 2; ++i) { - if (x_shape->dimensions(i) != y_shape->dimensions(i)) { + if (x_shape.dimensions(i) != y_shape.dimensions(i)) { return errors::InvalidArgument( "Dimension ", i, " of inputs to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(*x_shape), " vs ", - xla::ShapeUtil::HumanString(*y_shape)); + xla::ShapeUtil::HumanString(x_shape), " vs ", + xla::ShapeUtil::HumanString(y_shape)); } batch_dimension_numbers.push_back(i); } int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1); int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2); - if (x_shape->dimensions(x_inner_dim) != y_shape->dimensions(y_inner_dim)) { + if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) { return errors::InvalidArgument( "Dimensions ", x_inner_dim, " and ", y_inner_dim, " of arguments to BatchedDot must be equal: ", - xla::ShapeUtil::HumanString(*x_shape), " transpose: ", transpose_x, - " vs. ", xla::ShapeUtil::HumanString(*y_shape), + xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x, + " vs. ", xla::ShapeUtil::HumanString(y_shape), " transpose: ", transpose_y); } // Check for zero lhs/rhs dim size. 
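As a reminder of what `BatchDot` computes once these shape checks pass: every leading (batch) dimension is matched element-wise and the trailing two dimensions are multiplied as matrices. A dense row-major host reference for the simplest `[batch, m, k] x [batch, k, n]` case, with no transposes or conjugation (illustrative only):

```c++
#include <cstddef>
#include <vector>

// out[b, i, j] = sum_p x[b, i, p] * y[b, p, j]
std::vector<float> BatchMatMul(const std::vector<float>& x,
                               const std::vector<float>& y, int batch, int m,
                               int k, int n) {
  std::vector<float> out(static_cast<std::size_t>(batch) * m * n, 0.0f);
  for (int b = 0; b < batch; ++b) {
    for (int i = 0; i < m; ++i) {
      for (int j = 0; j < n; ++j) {
        float acc = 0.0f;
        for (int p = 0; p < k; ++p) {
          acc += x[(b * m + i) * k + p] * y[(b * k + p) * n + j];
        }
        out[(b * m + i) * n + j] = acc;
      }
    }
  }
  return out;
}
```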
- if (xla::ShapeUtil::HasZeroElements(*x_shape) || - xla::ShapeUtil::HasZeroElements(*y_shape)) { + if (xla::ShapeUtil::HasZeroElements(x_shape) || + xla::ShapeUtil::HasZeroElements(y_shape)) { std::vector dimensions(batch_dimension_numbers.size()); for (int i = 0; i < batch_dimension_numbers.size(); ++i) { - dimensions[i] = x_shape->dimensions(batch_dimension_numbers[i]); + dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]); } int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2); int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1); - dimensions.push_back(x_shape->dimensions(x_outer_dim)); - dimensions.push_back(y_shape->dimensions(y_outer_dim)); + dimensions.push_back(x_shape.dimensions(x_outer_dim)); + dimensions.push_back(y_shape.dimensions(y_outer_dim)); return builder->Broadcast( - builder->ConstantLiteral(xla::Literal::Zero(x_shape->element_type())), + builder->ConstantLiteral(xla::Literal::Zero(x_shape.element_type())), dimensions); } - if (x_shape->element_type() == xla::C64 && conjugate_x) { + if (x_shape.element_type() == xla::C64 && conjugate_x) { x = builder->Conj(x); } - if (y_shape->element_type() == xla::C64 && conjugate_y) { + if (y_shape.element_type() == xla::C64 && conjugate_y) { y = builder->Conj(y); } diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h index b230e885f10f45..1acc72033b05e7 100644 --- a/tensorflow/compiler/tf2xla/lib/batch_dot.h +++ b/tensorflow/compiler/tf2xla/lib/batch_dot.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_ -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" namespace tensorflow { @@ -43,10 +43,10 @@ namespace tensorflow { // It is computed as: // // output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :]) -xla::StatusOr BatchDot( - xla::ComputationBuilder* builder, xla::ComputationDataHandle x, - xla::ComputationDataHandle y, bool transpose_x, bool transpose_y, - bool conjugate_x = false, bool conjugate_y = false); +xla::StatusOr BatchDot(xla::XlaBuilder* builder, xla::XlaOp x, + xla::XlaOp y, bool transpose_x, + bool transpose_y, bool conjugate_x = false, + bool conjugate_y = false); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc index 203365e2ab07e0..3f1384bc864abd 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.cc +++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc @@ -47,23 +47,21 @@ namespace { // l[..., j+1:, j] = (a[..., j+1:, j] - np.dot(l[..., j+1:, :j], row_t)) / // l[..., j, j] // return l -xla::StatusOr CholeskyUnblocked( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a) { - TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, - builder->GetShape(a)); - const int n_dims = xla::ShapeUtil::Rank(*a_shape); - const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1); - gtl::ArraySlice major_dims(xla::AsInt64Slice(a_shape->dimensions()), +xla::StatusOr CholeskyUnblocked(xla::XlaBuilder* builder, + const xla::XlaOp& a) { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + const int n_dims = xla::ShapeUtil::Rank(a_shape); + const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); + gtl::ArraySlice 
major_dims(xla::AsInt64Slice(a_shape.dimensions()), /*pos=*/0, /*len=*/n_dims - 2); - xla::ComputationDataHandle l = Zeros(builder, *a_shape); + xla::XlaOp l = Zeros(builder, a_shape); // Construct the for loop body to iterate over rows. - auto body_fn = [&](xla::ComputationDataHandle i, - gtl::ArraySlice loop_vars, - xla::ComputationBuilder* body_builder) - -> xla::StatusOr> { + auto body_fn = [&](xla::XlaOp i, gtl::ArraySlice loop_vars, + xla::XlaBuilder* body_builder) + -> xla::StatusOr> { xla::Shape col_shape; xla::Shape row_shape; for (int64 d : major_dims) { @@ -72,12 +70,12 @@ xla::StatusOr CholeskyUnblocked( } row_shape.add_dimensions(1); row_shape.add_dimensions(n); - row_shape.set_element_type(a_shape->element_type()); + row_shape.set_element_type(a_shape.element_type()); auto mask_zeros_row = Zeros(body_builder, row_shape); col_shape.add_dimensions(n); col_shape.add_dimensions(1); - col_shape.set_element_type(a_shape->element_type()); + col_shape.set_element_type(a_shape.element_type()); auto mask_zeros_col = Zeros(body_builder, col_shape); std::vector mask_vector(n); @@ -101,7 +99,7 @@ xla::StatusOr CholeskyUnblocked( TF_ASSIGN_OR_RETURN(auto a_ii, DynamicSliceInMinorDims(body_builder, body_a, {i, i}, {1, 1})); // np.dot(row, np.swapaxes(row, -1, -2)) - xla::ComputationDataHandle diag_dot; + xla::XlaOp diag_dot; TF_ASSIGN_OR_RETURN(diag_dot, BatchDot(body_builder, row, row, /*transpose_x=*/false, /*transpose_y=*/true)); @@ -109,7 +107,7 @@ xla::StatusOr CholeskyUnblocked( // np.swapaxes(row, -1, -2))) auto l_ii = body_builder->Pow( body_builder->Sub(a_ii, diag_dot), - FloatLiteral(body_builder, a_shape->element_type(), 0.5)); + FloatLiteral(body_builder, a_shape.element_type(), 0.5)); // a[..., i+1:, i] auto ip1 = body_builder->Add(i, body_builder->ConstantR0(1)); @@ -140,7 +138,7 @@ xla::StatusOr CholeskyUnblocked( TF_ASSIGN_OR_RETURN(body_l, DynamicUpdateSliceInMinorDims( body_builder, body_l, l_ii, {i, i})); - return std::vector{body_a, body_l}; + return std::vector{body_a, body_l}; }; TF_ASSIGN_OR_RETURN( @@ -152,22 +150,20 @@ xla::StatusOr CholeskyUnblocked( } // namespace -xla::StatusOr Cholesky( - xla::ComputationBuilder* builder, xla::ComputationDataHandle a, - int64 block_size) { - TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, - builder->GetShape(a)); - const int ndims = xla::ShapeUtil::Rank(*a_shape); +xla::StatusOr Cholesky(xla::XlaBuilder* builder, xla::XlaOp a, + int64 block_size) { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + const int ndims = xla::ShapeUtil::Rank(a_shape); if (ndims < 2) { return errors::InvalidArgument( "Arguments to Cholesky must have rank >= 2: ", ndims); } - const int64 n = xla::ShapeUtil::GetDimension(*a_shape, -1); - if (n != xla::ShapeUtil::GetDimension(*a_shape, -2)) { + const int64 n = xla::ShapeUtil::GetDimension(a_shape, -1); + if (n != xla::ShapeUtil::GetDimension(a_shape, -2)) { return errors::InvalidArgument( "Arguments to Cholesky must be square matrices: ", - xla::ShapeUtil::HumanString(*a_shape)); + xla::ShapeUtil::HumanString(a_shape)); } if (block_size < 1) { @@ -179,7 +175,7 @@ xla::StatusOr Cholesky( // Algorithm 1 from // Haidar, Azzam, et al. "High-performance Cholesky factorization for GPU-only // execution." Proceedings of General Purpose GPUs. ACM, 2017. 
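`CholeskyUnblocked` follows the column-by-column recurrence sketched in its comment (the Cholesky-Banachiewicz scheme). The same recurrence on a single dense row-major matrix, as a plain C++ reference rather than the batched XLA loop:

```c++
#include <cmath>
#include <cstddef>
#include <vector>

// Returns the lower-triangular L with A = L * L^T for an n x n SPD matrix A.
std::vector<float> CholeskyLower(const std::vector<float>& a, int n) {
  std::vector<float> l(static_cast<std::size_t>(n) * n, 0.0f);
  for (int j = 0; j < n; ++j) {
    // l[j, j] = sqrt(a[j, j] - dot(l[j, :j], l[j, :j]))
    float diag = a[j * n + j];
    for (int p = 0; p < j; ++p) diag -= l[j * n + p] * l[j * n + p];
    l[j * n + j] = std::sqrt(diag);
    // l[i, j] = (a[i, j] - dot(l[i, :j], l[j, :j])) / l[j, j]
    for (int i = j + 1; i < n; ++i) {
      float s = a[i * n + j];
      for (int p = 0; p < j; ++p) s -= l[i * n + p] * l[j * n + p];
      l[i * n + j] = s / l[j * n + j];
    }
  }
  return l;
}
```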
- xla::ComputationDataHandle l = Zeros(builder, *a_shape); + xla::XlaOp l = Zeros(builder, a_shape); for (int64 i = 0; i < n; i += block_size) { int64 k = std::min(block_size, n - i); if (i > 0) { @@ -218,7 +214,7 @@ xla::StatusOr Cholesky( /*lower=*/true, /*transpose_a=*/true, /*conjugate_a=*/false, - /*block_size=*/8)); + /*block_size=*/block_size)); TF_ASSIGN_OR_RETURN( l, UpdateSliceInMinorDims(builder, l, update, {i + k, i})); } diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.h b/tensorflow/compiler/tf2xla/lib/cholesky.h index 17da8d8b22d107..20fca7969ece27 100644 --- a/tensorflow/compiler/tf2xla/lib/cholesky.h +++ b/tensorflow/compiler/tf2xla/lib/cholesky.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_CHOLESKY_H_ -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" namespace tensorflow { @@ -30,9 +30,8 @@ namespace tensorflow { // TODO(phawkins): check for negative values on the diagonal and return an // error, instead of silently yielding NaNs. // TODO(znado): handle the complex Hermitian case -xla::StatusOr Cholesky( - xla::ComputationBuilder* builder, xla::ComputationDataHandle a, - int64 block_size = 256); +xla::StatusOr Cholesky(xla::XlaBuilder* builder, xla::XlaOp a, + int64 block_size = 256); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/scatter.cc b/tensorflow/compiler/tf2xla/lib/scatter.cc index 45699233ea8b2a..d5a27abb2585f6 100644 --- a/tensorflow/compiler/tf2xla/lib/scatter.cc +++ b/tensorflow/compiler/tf2xla/lib/scatter.cc @@ -30,24 +30,19 @@ limitations under the License. namespace tensorflow { -xla::StatusOr XlaScatter( - const xla::ComputationDataHandle& buffer, - const xla::ComputationDataHandle& updates, - const xla::ComputationDataHandle& indices, bool indices_are_vectors, - const std::function& combiner, - xla::ComputationBuilder* builder) { - TF_ASSIGN_OR_RETURN(std::unique_ptr buffer_shape, - builder->GetShape(buffer)); - TF_ASSIGN_OR_RETURN(std::unique_ptr updates_shape, - builder->GetShape(updates)); - TF_ASSIGN_OR_RETURN(std::unique_ptr indices_shape, - builder->GetShape(indices)); +xla::StatusOr XlaScatter( + const xla::XlaOp& buffer, const xla::XlaOp& updates, + const xla::XlaOp& indices, bool indices_are_vectors, + const std::function& + combiner, + xla::XlaBuilder* builder) { + TF_ASSIGN_OR_RETURN(xla::Shape buffer_shape, builder->GetShape(buffer)); + TF_RETURN_IF_ERROR(builder->GetShape(updates).status()); + TF_ASSIGN_OR_RETURN(xla::Shape indices_shape, builder->GetShape(indices)); gtl::ArraySlice indices_dims = - xla::AsInt64Slice(indices_shape->dimensions()); + xla::AsInt64Slice(indices_shape.dimensions()); gtl::ArraySlice buffer_dims = - xla::AsInt64Slice(buffer_shape->dimensions()); + xla::AsInt64Slice(buffer_shape.dimensions()); // If the indices are N-dimensional, the minor dimension of indices contains // the indices to update. Otherwise the indices are all scalars. 
@@ -55,12 +50,12 @@ xla::StatusOr XlaScatter( if (indices_are_vectors) { TF_RET_CHECK(!indices_dims.empty()); num_index_dims = indices_dims.back(); - if (num_index_dims > xla::ShapeUtil::Rank(*buffer_shape)) { + if (num_index_dims > xla::ShapeUtil::Rank(buffer_shape)) { return errors::InvalidArgument( "The size of the minor dimension of the indices (shape: ", - xla::ShapeUtil::HumanString(*indices_shape), + xla::ShapeUtil::HumanString(indices_shape), ") must be <= the rank of the buffer (shape: ", - xla::ShapeUtil::HumanString(*buffer_shape), ")"); + xla::ShapeUtil::HumanString(buffer_shape), ")"); } indices_dims.pop_back(); } @@ -78,10 +73,10 @@ xla::StatusOr XlaScatter( // If any of the indexed dimensions are zero in the buffer, the update cannot // succeed since it updates a slice of size 1. for (int64 i = 0; i < num_index_dims; ++i) { - if (xla::ShapeUtil::GetDimension(*buffer_shape, i) == 0) { - return errors::InvalidArgument( - "Scatter dimension ", i, " is of size zero in tensor with shape ", - xla::ShapeUtil::HumanString(*buffer_shape)); + if (xla::ShapeUtil::GetDimension(buffer_shape, i) == 0) { + return errors::InvalidArgument("Scatter dimension ", i, + " is of size zero in tensor with shape ", + xla::ShapeUtil::HumanString(buffer_shape)); } } @@ -111,18 +106,17 @@ xla::StatusOr XlaScatter( // index = dynamic-slice(indices, i) // update = dynamic-slice(updates, i) // buffer = dynamic-update-slice(buffer, update, index) - auto body_fn = [&](xla::ComputationDataHandle i, - gtl::ArraySlice loop_vars, - xla::ComputationBuilder* body_builder) { + auto body_fn = [&](xla::XlaOp i, gtl::ArraySlice loop_vars, + xla::XlaBuilder* body_builder) { auto indices = loop_vars[0]; auto updates = loop_vars[1]; auto buffer = loop_vars[2]; auto zero_index = body_builder->ConstantLiteral( - xla::Literal::Zero(indices_shape->element_type())); + xla::Literal::Zero(indices_shape.element_type())); // Slice the i-th index from the indices array. - xla::ComputationDataHandle index; + xla::XlaOp index; auto indices_offset = body_builder->Reshape(i, {1}); if (indices_are_vectors) { indices_offset = body_builder->Pad(indices_offset, zero_index, @@ -180,12 +174,12 @@ xla::StatusOr XlaScatter( // Apply the update. buffer = body_builder->DynamicUpdateSlice(buffer, update, index); - return std::vector{indices, updates, buffer}; + return std::vector{indices, updates, buffer}; }; - TF_ASSIGN_OR_RETURN( - auto outputs, XlaForEachIndex(num_indices, indices_shape->element_type(), - body_fn, init, "scatter", builder)); + TF_ASSIGN_OR_RETURN(auto outputs, + XlaForEachIndex(num_indices, indices_shape.element_type(), + body_fn, init, "scatter", builder)); return outputs[2]; } diff --git a/tensorflow/compiler/tf2xla/lib/scatter.h b/tensorflow/compiler/tf2xla/lib/scatter.h index 41e6d3b195ebf9..87309e10ede320 100644 --- a/tensorflow/compiler/tf2xla/lib/scatter.h +++ b/tensorflow/compiler/tf2xla/lib/scatter.h @@ -18,8 +18,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" namespace tensorflow { @@ -39,14 +39,12 @@ namespace tensorflow { // If a `combiner` is provided, updates are combined with the existing values in // the buffer using the combiner function. Otherwise, the updates replace the // existing values. 
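The scatter loop above dynamically slices one index and one update per iteration and writes the result back with `DynamicUpdateSlice`. Ignoring batching and vector-valued indices, the semantics reduce to the small host reference below (illustrative; in the real kernel the order in which updates are applied is implementation-defined):

```c++
#include <cstddef>
#include <functional>
#include <vector>

// 1-D scatter: combine each update into buffer[indices[i]], or overwrite
// the existing value when no combiner is supplied.
void ScatterRef(std::vector<float>& buffer, const std::vector<int>& indices,
                const std::vector<float>& updates,
                const std::function<float(float, float)>& combiner) {
  for (std::size_t i = 0; i < indices.size(); ++i) {
    float& dst = buffer[indices[i]];
    dst = combiner ? combiner(dst, updates[i]) : updates[i];
  }
}
```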
The order of updates is implementation-defined. -xla::StatusOr XlaScatter( - const xla::ComputationDataHandle& buffer, - const xla::ComputationDataHandle& updates, - const xla::ComputationDataHandle& indices, bool indices_are_vectors, - const std::function& combiner, - xla::ComputationBuilder* builder); +xla::StatusOr XlaScatter( + const xla::XlaOp& buffer, const xla::XlaOp& updates, + const xla::XlaOp& indices, bool indices_are_vectors, + const std::function& + combiner, + xla::XlaBuilder* builder); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc index 9bf5821b54abe3..b4503601f94baa 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.cc @@ -1,4 +1,4 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -29,21 +29,20 @@ limitations under the License. namespace tensorflow { -xla::StatusOr TriangularSolve( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a, - bool conjugate_a, int64 block_size) { - TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, - builder->GetShape(a)); - TF_ASSIGN_OR_RETURN(std::unique_ptr b_shape, - builder->GetShape(b)); - if (xla::ShapeUtil::Rank(*a_shape) != xla::ShapeUtil::Rank(*b_shape)) { +xla::StatusOr TriangularSolve(xla::XlaBuilder* builder, + const xla::XlaOp& a, xla::XlaOp b, + bool left_side, bool lower, + bool transpose_a, bool conjugate_a, + int64 block_size) { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); + if (xla::ShapeUtil::Rank(a_shape) != xla::ShapeUtil::Rank(b_shape)) { return errors::InvalidArgument( "Arguments to TriangularSolve have different ranks: ", - xla::ShapeUtil::HumanString(*a_shape), " vs. ", - xla::ShapeUtil::HumanString(*b_shape)); + xla::ShapeUtil::HumanString(a_shape), " vs. ", + xla::ShapeUtil::HumanString(b_shape)); } - const int ndims = xla::ShapeUtil::Rank(*a_shape); + const int ndims = xla::ShapeUtil::Rank(a_shape); if (ndims < 2) { return errors::InvalidArgument( "Arguments to TriangularSolve must have rank >= 2: ", ndims); @@ -51,30 +50,30 @@ xla::StatusOr TriangularSolve( // The batch dimensions must be equal. 
std::vector batch_dimensions; for (int i = 0; i < ndims - 2; ++i) { - int64 a_size = a_shape->dimensions(i); - int64 b_size = b_shape->dimensions(i); + int64 a_size = a_shape.dimensions(i); + int64 b_size = b_shape.dimensions(i); if (a_size != b_size) { return errors::InvalidArgument( "Batch dimensions of arguments to TriangularSolve must be equal: ", - xla::ShapeUtil::HumanString(*a_shape), " vs ", - xla::ShapeUtil::HumanString(*b_shape)); + xla::ShapeUtil::HumanString(a_shape), " vs ", + xla::ShapeUtil::HumanString(b_shape)); } batch_dimensions.push_back(a_size); } - if (xla::ShapeUtil::GetDimension(*a_shape, -1) != - xla::ShapeUtil::GetDimension(*a_shape, -2)) { + if (xla::ShapeUtil::GetDimension(a_shape, -1) != + xla::ShapeUtil::GetDimension(a_shape, -2)) { return errors::InvalidArgument( "The 'a' arguments to TriangularSolve must be square matrices: ", - xla::ShapeUtil::HumanString(*a_shape)); + xla::ShapeUtil::HumanString(a_shape)); } - const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); - const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1); - if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(*a_shape, -1)) { + const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1); + if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(a_shape, -1)) { return errors::InvalidArgument( "Arguments to TriangularSolve have incompatible matrix shapes: ", - xla::ShapeUtil::HumanString(*a_shape), " vs ", - xla::ShapeUtil::HumanString(*b_shape)); + xla::ShapeUtil::HumanString(a_shape), " vs ", + xla::ShapeUtil::HumanString(b_shape)); } if (block_size < 1) { @@ -83,26 +82,18 @@ xla::StatusOr TriangularSolve( block_size); } - // Applies a complex conjugation operation if `a` is complex and `conjugate_a` - // is true, otherwise returns its argument. - auto maybe_conj = [&](xla::ComputationBuilder* builder, - xla::ComputationDataHandle x) { - auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; - return perform_conj ? builder->Conj(x) : x; - }; - - std::map base_computations; + std::map base_computations; auto get_base_triangular_solve = - [&](int k) -> xla::StatusOr { - xla::Computation& computation = base_computations[k]; + [&](int k) -> xla::StatusOr { + xla::XlaComputation& computation = base_computations[k]; if (computation.IsNull()) { - std::unique_ptr sub = builder->CreateSubBuilder( + std::unique_ptr sub = builder->CreateSubBuilder( tensorflow::strings::StrCat("trsm_base_", k)); auto a_param = sub->Parameter( 0, xla::ShapeUtil::MakeShape( - b_shape->element_type(), + b_shape.element_type(), PrependMajorDims(sub.get(), batch_dimensions, {k, k})), "a"); @@ -115,20 +106,25 @@ xla::StatusOr TriangularSolve( auto b_param = sub->Parameter( 1, xla::ShapeUtil::MakeShape( - b_shape->element_type(), + b_shape.element_type(), PrependMajorDims(sub.get(), batch_dimensions, b_lastd)), "b"); - // We use a left-looking subroutine on the block diagonal in some common - // cases, while falling back to a recursive call in unsupported cases. The - // left-looking subroutine is written with a While loop and so yields much - // faster compile times. Moreover, the left-looking variant can give - // higher performance on smaller (sub)problems. + // We use a left-looking or right-looking subroutine on the block diagonal + // in the lower=true cases, while falling back to a recursive call in + // others. The left-looking and right-looking subroutines are written with + // a While loop and so yields much faster compile times. 
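For orientation, the left-looking subroutine used on the block diagonal handles the `left_side && lower && !transpose_a` case, which on a single matrix is ordinary forward substitution. A host-side reference (illustrative only; the XLA version is batched and built as a While loop):

```c++
#include <vector>

// Solve a * x = b where a is an m x m lower-triangular matrix and b is
// m x n, both dense row-major. Returns x.
std::vector<float> ForwardSubstitution(const std::vector<float>& a,
                                       const std::vector<float>& b, int m,
                                       int n) {
  std::vector<float> x(b);
  for (int i = 0; i < m; ++i) {
    for (int c = 0; c < n; ++c) {
      float s = x[i * n + c];
      // Subtract the contribution of the rows already solved.
      for (int p = 0; p < i; ++p) s -= a[i * m + p] * x[p * n + c];
      x[i * n + c] = s / a[i * m + i];
    }
  }
  return x;
}
```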
Moreover, they + // can give higher performance on smaller (sub)problems. if (left_side && lower) { TF_RETURN_IF_ERROR(TriangularSolveLeftLooking(sub.get(), a_param, b_param, transpose_a, conjugate_a) .status()); + } else if (!left_side && lower) { + TF_RETURN_IF_ERROR(TriangularSolveRightLooking(sub.get(), a_param, + b_param, transpose_a, + conjugate_a) + .status()); } else { TF_RETURN_IF_ERROR(TriangularSolve(sub.get(), a_param, b_param, left_side, lower, transpose_a, @@ -142,7 +138,7 @@ xla::StatusOr TriangularSolve( return &computation; }; - xla::ComputationDataHandle output = Zeros(builder, *b_shape); + xla::XlaOp output = Zeros(builder, b_shape); // Right-looking blocked triangular solve. // For an explanation of the algorithm, see the TRSM discussion in: @@ -165,13 +161,15 @@ xla::StatusOr TriangularSolve( SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); TF_ASSIGN_OR_RETURN(auto b_slice, SliceInMinorDims(builder, b, {0, i}, {m, i + k})); - xla::ComputationDataHandle update; + xla::XlaOp update; if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::Computation * solve, + TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, get_base_triangular_solve(k)); update = builder->Call(*solve, {a_slice, b_slice}); } else { - update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + TF_ASSIGN_OR_RETURN(auto a_slice_conj, + MaybeConjugate(builder, a_slice, conjugate_a)); + update = builder->Div(b_slice, a_slice_conj); } TF_ASSIGN_OR_RETURN( output, UpdateSliceInMinorDims(builder, output, update, {0, i})); @@ -181,7 +179,7 @@ xla::StatusOr TriangularSolve( // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 // b[..., :, i+k:] -= np.matmul(output[..., :, i:i+k], a_slice_2) if (i + k < n) { - xla::ComputationDataHandle a_slice_2; + xla::XlaOp a_slice_2; if (lower) { TF_ASSIGN_OR_RETURN( a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {n, i + k})); @@ -215,13 +213,15 @@ xla::StatusOr TriangularSolve( SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); TF_ASSIGN_OR_RETURN(auto b_slice, SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); - xla::ComputationDataHandle update; + xla::XlaOp update; if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::Computation * solve, + TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, get_base_triangular_solve(k)); update = builder->Call(*solve, {a_slice, b_slice}); } else { - update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + TF_ASSIGN_OR_RETURN(auto a_slice_conj, + MaybeConjugate(builder, a_slice, conjugate_a)); + update = builder->Div(b_slice, a_slice_conj); } TF_ASSIGN_OR_RETURN( output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); @@ -231,7 +231,7 @@ xla::StatusOr TriangularSolve( // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 // b[..., i+k:, :] -= np.matmul(a_slice_2, output[..., i:i+k, :]) if (i + k < m) { - xla::ComputationDataHandle a_slice_2; + xla::XlaOp a_slice_2; if (lower) { TF_ASSIGN_OR_RETURN( a_slice_2, SliceInMinorDims(builder, a, {i + k, i}, {m, i + k})); @@ -264,13 +264,15 @@ xla::StatusOr TriangularSolve( SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); TF_ASSIGN_OR_RETURN(auto b_slice, SliceInMinorDims(builder, b, {0, i}, {m, i + k})); - xla::ComputationDataHandle update; + xla::XlaOp update; if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::Computation * solve, + TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, get_base_triangular_solve(k)); update = builder->Call(*solve, {a_slice, b_slice}); } else { - update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + TF_ASSIGN_OR_RETURN(auto a_slice_conj, + 
MaybeConjugate(builder, a_slice, conjugate_a)); + update = builder->Div(b_slice, a_slice_conj); } TF_ASSIGN_OR_RETURN( output, UpdateSliceInMinorDims(builder, output, update, {0, i})); @@ -280,7 +282,7 @@ xla::StatusOr TriangularSolve( // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 // b[..., :, :i] -= np.matmul(out[..., :, i:i+k], a_slice_2) if (i - k >= 0) { - xla::ComputationDataHandle a_slice_2; + xla::XlaOp a_slice_2; if (lower) { TF_ASSIGN_OR_RETURN(a_slice_2, SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); @@ -314,13 +316,15 @@ xla::StatusOr TriangularSolve( SliceInMinorDims(builder, a, {i, i}, {i + k, i + k})); TF_ASSIGN_OR_RETURN(auto b_slice, SliceInMinorDims(builder, b, {i, 0}, {i + k, n})); - xla::ComputationDataHandle update; + xla::XlaOp update; if (k > 1) { - TF_ASSIGN_OR_RETURN(xla::Computation * solve, + TF_ASSIGN_OR_RETURN(xla::XlaComputation * solve, get_base_triangular_solve(k)); update = builder->Call(*solve, {a_slice, b_slice}); } else { - update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + TF_ASSIGN_OR_RETURN(auto a_slice_conj, + MaybeConjugate(builder, a_slice, conjugate_a)); + update = builder->Div(b_slice, a_slice_conj); } TF_ASSIGN_OR_RETURN( output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); @@ -330,7 +334,7 @@ xla::StatusOr TriangularSolve( // a_slice_2 = T(a_slice_2) if transpose_a else a_slice_2 // b[..., :i, :] -= np.matmul(a_slice_2, out[..., i:i+k, :]) if (i - k >= 0) { - xla::ComputationDataHandle a_slice_2; + xla::XlaOp a_slice_2; if (lower) { TF_ASSIGN_OR_RETURN(a_slice_2, SliceInMinorDims(builder, a, {i, 0}, {i + k, i})); @@ -356,29 +360,23 @@ xla::StatusOr TriangularSolve( return output; } -xla::StatusOr TriangularSolveLeftLooking( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a) { - TF_ASSIGN_OR_RETURN(std::unique_ptr a_shape, - builder->GetShape(a)); - TF_ASSIGN_OR_RETURN(std::unique_ptr b_shape, - builder->GetShape(b)); - const int64 m = xla::ShapeUtil::GetDimension(*b_shape, -2); - const int64 n = xla::ShapeUtil::GetDimension(*b_shape, -1); - const int64 ndims = xla::ShapeUtil::Rank(*a_shape); +xla::StatusOr TriangularSolveLeftLooking(xla::XlaBuilder* builder, + const xla::XlaOp& a, + const xla::XlaOp& b, + bool transpose_a, + bool conjugate_a) { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); + const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1); + const int64 ndims = xla::ShapeUtil::Rank(a_shape); std::vector batch_dimensions; for (int i = 0; i < ndims - 2; ++i) { - int64 a_size = a_shape->dimensions(i); + int64 a_size = a_shape.dimensions(i); batch_dimensions.push_back(a_size); } - auto maybe_conj = [&](xla::ComputationBuilder* builder, - xla::ComputationDataHandle x) { - auto perform_conj = a_shape->element_type() == xla::C64 && conjugate_a; - return perform_conj ? builder->Conj(x) : x; - }; - // The main computation is performed in a While loop. // Allocate the output and set its first or last row, @@ -387,14 +385,16 @@ xla::StatusOr TriangularSolveLeftLooking( // output[..., m-1:, :] = b[..., m-1:, :] / a[..., m-1:, m-1:] // else: // output[..., :1, :] = b[..., :1, :] / a[..., :1, :1] - xla::ComputationDataHandle output = Zeros(builder, *b_shape); + xla::XlaOp output = Zeros(builder, b_shape); { auto i = transpose_a ? 
m - 1 : 0; TF_ASSIGN_OR_RETURN(auto a_slice, SliceInMinorDims(builder, a, {i, i}, {i + 1, i + 1})); TF_ASSIGN_OR_RETURN(auto b_slice, SliceInMinorDims(builder, b, {i, 0}, {i + 1, n})); - auto update = builder->Div(b_slice, maybe_conj(builder, a_slice)); + TF_ASSIGN_OR_RETURN(auto a_slice_conj, + MaybeConjugate(builder, a_slice, conjugate_a)); + auto update = builder->Div(b_slice, a_slice_conj); TF_ASSIGN_OR_RETURN( output, UpdateSliceInMinorDims(builder, output, update, {i, 0})); } @@ -408,11 +408,11 @@ xla::StatusOr TriangularSolveLeftLooking( // The loop iteration counter is a scalar, incremented each iteration. xla::ShapeUtil::MakeShape(xla::S32, {}), // The output has the shape of b, with one row updated each iteration. - *b_shape, + b_shape, // The coefficient matrix a is a loop invariant. - *a_shape, + a_shape, // The right-hand-side matrix b is a loop invariant. - *b_shape}; + b_shape}; xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes); auto init_i = builder->ConstantR0(transpose_a ? m - 2 : 1); auto init = builder->Tuple({init_i, output, a, b}); @@ -421,7 +421,7 @@ xla::StatusOr TriangularSolveLeftLooking( // def cond_fun(loop_carry): // i, output, a, b = loop_carry // return i >= 0 if transpose_a else i < m - std::unique_ptr condb = + std::unique_ptr condb = builder->CreateSubBuilder("TriangularSolveLeftLookingWhileCond"); { auto i = condb->GetTupleElement( @@ -451,7 +451,7 @@ xla::StatusOr TriangularSolveLeftLooking( // return (i + 1, output, a, b) // We have to do some extra FLOPs propagating zeros in the matrix multiply // because we can't have the size of its arguments depend on the loop counter. - std::unique_ptr bodyb = + std::unique_ptr bodyb = builder->CreateSubBuilder("TriangularSolveLeftLookingWhileBody"); { auto input_tuple = bodyb->Parameter(0, tuple_shape, @@ -475,7 +475,7 @@ xla::StatusOr TriangularSolveLeftLooking( // But since we can't have intermediate array sizes depend on the loop // counter, we instead exploit the fact that we initialized the output to // all zeros and use that as zero-padding (doing unnecessary FLOPs). 
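The zero-padding observation above can be stated in one line for the non-transposed, left-looking orientation: because every row of `output` that has not been solved yet is still all zeros, the fixed-shape product used in the loop body satisfies

```latex
\[
  a_{i,:}\,\mathrm{output}
    \;=\; \sum_{j} a_{i,j}\,\mathrm{output}_{j,:}
    \;=\; \sum_{j<i} a_{i,j}\,\mathrm{output}_{j,:},
\]
```

so multiplying by the full, fixed-size `output` yields exactly the partial sum that a loop-counter-dependent slice would, at the cost of the extra FLOPs on known zeros.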
- xla::ComputationDataHandle a_row; + xla::XlaOp a_row; if (transpose_a) { TF_ASSIGN_OR_RETURN(a_row, DynamicSliceInMinorDims(bodyb.get(), body_a, {zero, i}, {m, 1})); @@ -496,7 +496,9 @@ xla::StatusOr TriangularSolveLeftLooking( // body_out[..., i:i+1, :] = result_row / a[..., i:i+1, i:i+1] TF_ASSIGN_OR_RETURN(auto a_elt, DynamicSliceInMinorDims(bodyb.get(), body_a, {i, i}, {1, 1})); - auto div_result = bodyb->Div(result_row, maybe_conj(bodyb.get(), a_elt)); + TF_ASSIGN_OR_RETURN(auto a_elt_conj, + MaybeConjugate(bodyb.get(), a_elt, conjugate_a)); + auto div_result = bodyb->Div(result_row, a_elt_conj); TF_ASSIGN_OR_RETURN(body_out, DynamicUpdateSliceInMinorDims(bodyb.get(), body_out, div_result, {i, zero})); @@ -516,4 +518,130 @@ xla::StatusOr TriangularSolveLeftLooking( return builder->GetTupleElement(triangular_solve_left_looking_while, 1); } +xla::StatusOr TriangularSolveRightLooking(xla::XlaBuilder* builder, + const xla::XlaOp& a, + const xla::XlaOp& b, + bool transpose_a, + bool conjugate_a) { + TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a)); + TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b)); + const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2); + const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1); + const int64 ndims = xla::ShapeUtil::Rank(a_shape); + + std::vector batch_dimensions; + for (int i = 0; i < ndims - 2; ++i) { + int64 a_size = a_shape.dimensions(i); + batch_dimensions.push_back(a_size); + } + + // The main computation is performed in a While loop. + xla::XlaOp output = Zeros(builder, b_shape); + + // Construct the initial loop carry tuple, + // if transpose_a: + // init = (0, output, a, b) + // else: + // init = (n-1, output, a, b) + std::vector tuple_shapes = { + // The loop iteration counter is a scalar, incremented each iteration. + xla::ShapeUtil::MakeShape(xla::S32, {}), + // The output has the shape of b, with one row updated each iteration. + b_shape, + // The coefficient matrix a is a loop invariant. + a_shape, + // The right-hand-side matrix b is a loop invariant. + b_shape}; + xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes); + auto init_i = builder->ConstantR0(transpose_a ? 0 : n - 1); + auto init = builder->Tuple({init_i, output, a, b}); + + // Construct the loop condition function, + // def cond_fun(loop_carry): + // i, output, a, b = loop_carry + // return i < n if transpose_a else i >= 0 + std::unique_ptr condb = + builder->CreateSubBuilder("TriangularSolveRightLookingWhileCond"); + { + auto i = condb->GetTupleElement( + condb->Parameter(0, tuple_shape, + "TriangularSolveRightLookingWhileTuple"), + 0); + if (transpose_a) { + condb->Lt(i, condb->ConstantR0(n)); + } else { + condb->Ge(i, condb->ConstantR0(0)); + } + } + TF_ASSIGN_OR_RETURN(auto cond, condb->Build()); + + // Construct the loop body function, + // def body_fun(loop_carry): + // i, output, a, b = loop_carry + // if transpose_a: + // a_row = np.swapaxes(a[..., :, i:i+1], -1 -2) + // else: + // a_row = a[..., :, i:i+1] + // result_row = b[..., :, i:i+1] - np.matmul(output, a_row) + // output[..., :, i:i+1] = result_row / a[..., i:i+1, i:i+1] + // if transpose_a: + // return (i - 1, output, a, b) + // else: + // return (i + 1, output, a, b) + // We have to do some extra FLOPs propagating zeros in the matrix multiply + // because we can't have the size of its arguments depend on the loop counter. 
+ std::unique_ptr bodyb = + builder->CreateSubBuilder("TriangularSolveRightLookingWhileBody"); + { + auto input_tuple = bodyb->Parameter( + 0, tuple_shape, "TriangularSolveRightLookingWhileTuple"); + + // i, output, a, b = loop_carry + auto i = bodyb->GetTupleElement(input_tuple, 0); + auto body_out = bodyb->GetTupleElement(input_tuple, 1); + auto body_a = bodyb->GetTupleElement(input_tuple, 2); + auto body_b = bodyb->GetTupleElement(input_tuple, 3); + auto zero = bodyb->ConstantR0(0); + + // We'd like to implement b[..., :, i:i+1] - np.matmul(output, a[..., :, + // i:i+1]) But since we can't have intermediate array sizes depend on the + // loop counter, we instead exploit the fact that we initialized the output + // to all zeros and use that as zero-padding (doing unnecessary FLOPs). + TF_ASSIGN_OR_RETURN(auto b_update, BatchDot(bodyb.get(), body_out, body_a, + /*transpose_x=*/false, + /*transpose_y=*/transpose_a, + /*conjugate_x=*/false, + /*conjugate_y=*/conjugate_a)); + // result = b - np.matmul(output, a) + auto result = bodyb->Sub(body_b, b_update); + // result_row = result[..., :, i:i+1] + TF_ASSIGN_OR_RETURN( + auto result_row, + DynamicSliceInMinorDims(bodyb.get(), result, {zero, i}, {m, 1})); + + // body_out[..., :, i:i+1] = result_row / a[..., i:i+1, i:i+1] + TF_ASSIGN_OR_RETURN(auto a_ii, DynamicSliceInMinorDims(bodyb.get(), body_a, + {i, i}, {1, 1})); + TF_ASSIGN_OR_RETURN(auto a_ii_conj, + MaybeConjugate(bodyb.get(), a_ii, conjugate_a)); + auto div_result = bodyb->Div(result_row, a_ii_conj); + TF_ASSIGN_OR_RETURN(body_out, + DynamicUpdateSliceInMinorDims(bodyb.get(), body_out, + div_result, {zero, i})); + + // if transpose_a: + // return (i + 1, body_out, a, b) + // else: + // return (i - 1, body_out, a, b) + auto next_i = bodyb->Add(i, bodyb->ConstantR0(transpose_a ? 1 : -1)); + bodyb->Tuple({next_i, body_out, body_a, body_b}); + } + TF_ASSIGN_OR_RETURN(auto body, bodyb->Build()); + + // Construct the While loop and return the result, + // return while_loop(cond_fun, body_fun, init)[1] + auto triangular_solve_left_looking_while = builder->While(cond, body, init); + return builder->GetTupleElement(triangular_solve_left_looking_while, 1); +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/tf2xla/lib/triangular_solve.h index e32223bfdddda8..540c26b2473df9 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve.h @@ -16,8 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_ -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" namespace tensorflow { @@ -57,14 +57,23 @@ namespace tensorflow { // // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no // blocking is used. 
-xla::StatusOr TriangularSolve( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - xla::ComputationDataHandle b, bool left_side, bool lower, bool transpose_a, - bool conjugate_a, int64 block_size = 256); +xla::StatusOr TriangularSolve(xla::XlaBuilder* builder, + const xla::XlaOp& a, xla::XlaOp b, + bool left_side, bool lower, + bool transpose_a, bool conjugate_a, + int64 block_size = 256); -xla::StatusOr TriangularSolveLeftLooking( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& a, - const xla::ComputationDataHandle& b, bool transpose_a, bool conjugate_a); +xla::StatusOr TriangularSolveLeftLooking(xla::XlaBuilder* builder, + const xla::XlaOp& a, + const xla::XlaOp& b, + bool transpose_a, + bool conjugate_a); + +xla::StatusOr TriangularSolveRightLooking(xla::XlaBuilder* builder, + const xla::XlaOp& a, + const xla::XlaOp& b, + bool transpose_a, + bool conjugate_a); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc index 66170706291626..87ea4763f7c235 100644 --- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc +++ b/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc @@ -20,7 +20,7 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -80,9 +80,9 @@ xla::Array2D AValsFull() { } XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -102,9 +102,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -124,9 +124,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerNotranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -146,9 +146,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightUpperTranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleRightUpperNotranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsRight(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -168,9 +168,9 @@ XLA_TEST_F(TriangularSolveTest, 
SimpleRightUpperNotranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -191,9 +191,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerTranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -214,9 +214,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftLowerNotranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -237,9 +237,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpper(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); auto result = TriangularSolve(&builder, a, b, @@ -260,9 +260,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperNotranspose) { } XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLowerComplex(), 0, "a", &builder, &a); auto b_data = @@ -288,9 +288,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleRightLowerTransposeConjugate) { } XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsUpperComplex(), 0, "a", &builder, &a); auto b_data = @@ -318,9 +318,9 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) { } XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsLower(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", &builder, &b); auto result = TriangularSolveLeftLooking(&builder, a, b, @@ -340,9 +340,9 @@ XLA_TEST_F(TriangularSolveLeftLookingTest, Simple) { } XLA_TEST_F(TriangularSolveLeftLookingTest, NonzeroUpperTriangle) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b; + xla::XlaOp a, b; auto a_data = CreateR2Parameter(AValsFull(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter(BValsLeft(), 1, "b", 
&builder, &b); auto result = TriangularSolveLeftLooking(&builder, a, b, diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index 31d823ca336039..d9ff7e6259f3fb 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -27,15 +27,14 @@ limitations under the License. namespace tensorflow { -xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder, - const xla::Shape& shape) { +xla::XlaOp Zeros(xla::XlaBuilder* builder, const xla::Shape& shape) { return builder->Broadcast( builder->ConstantLiteral(xla::Literal::Zero(shape.element_type())), xla::AsInt64Slice(shape.dimensions())); } -xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder, - xla::PrimitiveType type, double value) { +xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, + double value) { switch (type) { case xla::F16: return builder->ConstantR0(static_cast(value)); @@ -57,9 +56,8 @@ xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder, } } -xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* builder, - xla::PrimitiveType type, - int64 value) { +xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, + int64 value) { xla::Literal literal; switch (type) { case xla::U8: @@ -112,17 +110,18 @@ xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* builder, return builder->ConstantLiteral(literal); } -xla::StatusOr SliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - gtl::ArraySlice start, gtl::ArraySlice end) { +xla::StatusOr SliceInMinorDims(xla::XlaBuilder* builder, + const xla::XlaOp& x, + gtl::ArraySlice start, + gtl::ArraySlice end) { TF_RET_CHECK(start.size() == end.size()); int64 n_minor_dims = start.size(); - TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(*shape); + const int64 n_dims = xla::ShapeUtil::Rank(shape); TF_RET_CHECK(n_minor_dims <= n_dims); - gtl::ArraySlice major_dims(xla::AsInt64Slice(shape->dimensions()), + gtl::ArraySlice major_dims(xla::AsInt64Slice(shape.dimensions()), /*pos=*/0, /*len=*/n_dims - n_minor_dims); @@ -140,7 +139,7 @@ xla::StatusOr SliceInMinorDims( return builder->Slice(x, padded_start, padded_end, strides); } -std::vector PrependMajorDims(xla::ComputationBuilder* builder, +std::vector PrependMajorDims(xla::XlaBuilder* builder, const gtl::ArraySlice& major_dims, const gtl::ArraySlice& indices) { std::vector output(indices.size() + major_dims.size()); @@ -149,16 +148,16 @@ std::vector PrependMajorDims(xla::ComputationBuilder* builder, return output; } -xla::StatusOr DynamicSliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const std::vector& starts, +xla::StatusOr DynamicSliceInMinorDims( + xla::XlaBuilder* builder, const xla::XlaOp& x, + const std::vector& starts, const gtl::ArraySlice& sizes) { - TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(*shape); + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); int64 n_minor_dims = starts.size(); TF_RET_CHECK(n_minor_dims == sizes.size()); TF_RET_CHECK(n_minor_dims <= n_dims); - gtl::ArraySlice major_dims(xla::AsInt64Slice(shape->dimensions()), + gtl::ArraySlice major_dims(xla::AsInt64Slice(shape.dimensions()), /*pos=*/0, 
/*len=*/n_dims - sizes.size()); TF_ASSIGN_OR_RETURN(auto padded_starts, @@ -167,27 +166,29 @@ xla::StatusOr DynamicSliceInMinorDims( return builder->DynamicSlice(x, padded_starts, padded_sizes); } -xla::StatusOr UpdateSlice( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const xla::ComputationDataHandle& update, gtl::ArraySlice start) { +xla::StatusOr UpdateSlice(xla::XlaBuilder* builder, + const xla::XlaOp& x, + const xla::XlaOp& update, + gtl::ArraySlice start) { // TODO(phawkins): make int64 work on all backends, remove the int32 cast. std::vector start_as_int32(start.begin(), start.end()); auto start_constant = builder->ConstantR1(start_as_int32); - TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(*shape); - TF_ASSIGN_OR_RETURN(std::unique_ptr start_constant_shape, + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); + TF_ASSIGN_OR_RETURN(xla::Shape start_constant_shape, builder->GetShape(start_constant)); const int64 start_length = - xla::ShapeUtil::GetDimension(*start_constant_shape, -1); + xla::ShapeUtil::GetDimension(start_constant_shape, -1); TF_RET_CHECK(start_length == n_dims); return builder->DynamicUpdateSlice(x, update, start_constant); } -xla::StatusOr UpdateSliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const xla::ComputationDataHandle& update, gtl::ArraySlice start) { - TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(*shape); +xla::StatusOr UpdateSliceInMinorDims(xla::XlaBuilder* builder, + const xla::XlaOp& x, + const xla::XlaOp& update, + gtl::ArraySlice start) { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); const int64 n_minor_dims = start.size(); TF_RET_CHECK(n_minor_dims <= n_dims); std::vector padded_start(n_dims, 0); @@ -196,22 +197,21 @@ xla::StatusOr UpdateSliceInMinorDims( return UpdateSlice(builder, x, update, padded_start); } -xla::StatusOr DynamicUpdateSliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const xla::ComputationDataHandle& update, - const std::vector& starts) { +xla::StatusOr DynamicUpdateSliceInMinorDims( + xla::XlaBuilder* builder, const xla::XlaOp& x, const xla::XlaOp& update, + const std::vector& starts) { TF_ASSIGN_OR_RETURN(auto padded_starts, PrependZerosInMajorDims(builder, x, starts)); return builder->DynamicUpdateSlice(x, update, padded_starts); } -xla::StatusOr PrependZerosInMajorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const std::vector& starts) { - TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(*shape); +xla::StatusOr PrependZerosInMajorDims( + xla::XlaBuilder* builder, const xla::XlaOp& x, + const std::vector& starts) { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); auto zero = builder->Reshape(builder->ConstantR0(0), {1}); - std::vector padded_starts(n_dims, zero); + std::vector padded_starts(n_dims, zero); for (int i = 0; i < starts.size(); ++i) { padded_starts[n_dims - starts.size() + i] = builder->Reshape(starts[i], {1}); @@ -219,10 +219,10 @@ xla::StatusOr PrependZerosInMajorDims( return builder->ConcatInDim(padded_starts, 0); } -xla::StatusOr TransposeInMinorDims( - xla::ComputationBuilder* 
builder, const xla::ComputationDataHandle& x) { - TF_ASSIGN_OR_RETURN(std::unique_ptr shape, builder->GetShape(x)); - const int64 n_dims = xla::ShapeUtil::Rank(*shape); +xla::StatusOr TransposeInMinorDims(xla::XlaBuilder* builder, + const xla::XlaOp& x) { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + const int64 n_dims = xla::ShapeUtil::Rank(shape); TF_RET_CHECK(n_dims >= 2); std::vector permutation(n_dims); std::iota(permutation.begin(), permutation.end(), 0); @@ -230,4 +230,11 @@ xla::StatusOr TransposeInMinorDims( return builder->Transpose(x, permutation); } +xla::StatusOr MaybeConjugate(xla::XlaBuilder* builder, + const xla::XlaOp& x, bool conjugate) { + TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x)); + auto perform_conj = shape.element_type() == xla::C64 && conjugate; + return perform_conj ? builder->Conj(x) : x; +} + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h index b684123f1363cf..3c120a2548576d 100644 --- a/tensorflow/compiler/tf2xla/lib/util.h +++ b/tensorflow/compiler/tf2xla/lib/util.h @@ -16,75 +16,79 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ #define TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_ -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/gtl/array_slice.h" namespace tensorflow { // Returns a zero-filled tensor with shape `shape`. -xla::ComputationDataHandle Zeros(xla::ComputationBuilder* builder, - const xla::Shape& shape); +xla::XlaOp Zeros(xla::XlaBuilder* builder, const xla::Shape& shape); // Returns a floating point scalar constant of 'type' with 'value'. // If 'type' is complex, returns a real value with zero imaginary component. -xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* builder, - xla::PrimitiveType type, double value); +xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, + double value); // Makes a 1D tensor [0, ..., x, y] from two tensors x and y with zeros // prepended until the array is length n_dims. -xla::ComputationDataHandle PrependZerosInMajorDims( - xla::ComputationBuilder* builder, - gtl::ArraySlice starts); +xla::XlaOp PrependZerosInMajorDims(xla::XlaBuilder* builder, + gtl::ArraySlice starts); // Returns a integer scalar constant of 'type' with 'value'. // If 'type' is complex, returns a real value with zero imaginary component. -xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* builder, - xla::PrimitiveType type, int64 value); +xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, + int64 value); // Builds a vector of zeros of length rank(x) with the last two values being // those in `starts`. -xla::StatusOr PrependZerosInMajorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const std::vector& starts); +xla::StatusOr PrependZerosInMajorDims( + xla::XlaBuilder* builder, const xla::XlaOp& x, + const std::vector& starts); // Performs a slice in the minor dimensions of a Tensor. 
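Before the declarations continue, a short hypothetical usage sketch of the minor-dims convention these helpers share; the function name and shapes are illustrative only, and `x` is assumed to be a rank-3 value of shape [batch, m, n] built on the same `builder`.

```cpp
// Hypothetical sketch: slice x[..., 0:2, 0:3] with SliceInMinorDims. `start`
// and `end` name only the two minor (trailing) dimensions; every major
// (batch) dimension is taken whole, matching the implementation in util.cc.
xla::StatusOr<xla::XlaOp> SliceLeadingBlock(xla::XlaBuilder* builder,
                                            const xla::XlaOp& x) {
  return SliceInMinorDims(builder, x, /*start=*/{0, 0}, /*end=*/{2, 3});
}
```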
-xla::StatusOr SliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - gtl::ArraySlice start, gtl::ArraySlice end); +xla::StatusOr SliceInMinorDims(xla::XlaBuilder* builder, + const xla::XlaOp& x, + gtl::ArraySlice start, + gtl::ArraySlice end); // Builds a 1-d vector out of a concatenation of `major_dims` and `starts`. -std::vector PrependMajorDims(xla::ComputationBuilder* builder, +std::vector PrependMajorDims(xla::XlaBuilder* builder, const gtl::ArraySlice& major_dims, const gtl::ArraySlice& indices); // Performs a dynamic slice in the minor dimensions of a Tensor. -xla::StatusOr DynamicSliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const std::vector& starts, - const gtl::ArraySlice& sizes); +xla::StatusOr DynamicSliceInMinorDims( + xla::XlaBuilder* builder, const xla::XlaOp& x, + const std::vector& starts, const gtl::ArraySlice& sizes); // Updates a slice of 'x', i.e., // x[start[0], ..., start[n]] = update -xla::StatusOr UpdateSlice( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const xla::ComputationDataHandle& update, gtl::ArraySlice start); +xla::StatusOr UpdateSlice(xla::XlaBuilder* builder, + const xla::XlaOp& x, + const xla::XlaOp& update, + gtl::ArraySlice start); // Updates a slice of 'x', where 'start' contains a list of minor dimensions: // x[..., start[0], ..., start[n]] = update -xla::StatusOr UpdateSliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const xla::ComputationDataHandle& update, gtl::ArraySlice start); +xla::StatusOr UpdateSliceInMinorDims(xla::XlaBuilder* builder, + const xla::XlaOp& x, + const xla::XlaOp& update, + gtl::ArraySlice start); -xla::StatusOr DynamicUpdateSliceInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x, - const xla::ComputationDataHandle& update, - const std::vector& starts); +xla::StatusOr DynamicUpdateSliceInMinorDims( + xla::XlaBuilder* builder, const xla::XlaOp& x, const xla::XlaOp& update, + const std::vector& starts); // Transposes a stack of matrices `x` by swapping the last two dimensions. -xla::StatusOr TransposeInMinorDims( - xla::ComputationBuilder* builder, const xla::ComputationDataHandle& x); +xla::StatusOr TransposeInMinorDims(xla::XlaBuilder* builder, + const xla::XlaOp& x); + +// Applies a complex conjugation operation if `a` is complex and `conjugate_a` +// is true, otherwise returns its argument. +xla::StatusOr MaybeConjugate(xla::XlaBuilder* builder, + const xla::XlaOp& x, bool conjugate); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/tf2xla/lib/util_test.cc index b6bd33af2e42a4..265b39402c832f 100644 --- a/tensorflow/compiler/tf2xla/lib/util_test.cc +++ b/tensorflow/compiler/tf2xla/lib/util_test.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/lib/batch_dot.h" #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -65,9 +64,9 @@ xla::Array3D BatchedAValsFull() { } XLA_TEST_F(UtilTest, Simple2dLookup) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, x, y; + xla::XlaOp a, x, y; auto a_data = CreateR2Parameter(BValsRight(), 0, "a", &builder, &a); auto x_data = CreateR0Parameter(2, 1, "x", &builder, &x); auto y_data = CreateR0Parameter(1, 2, "y", &builder, &y); @@ -80,9 +79,9 @@ XLA_TEST_F(UtilTest, Simple2dLookup) { } XLA_TEST_F(UtilTest, Simple3dLookup) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, index; + xla::XlaOp a, index; auto a_data = CreateR3Parameter(BatchedAValsFull(), 0, "a", &builder, &a); auto index_data = CreateR0Parameter(1, 1, "index", &builder, &index); @@ -97,9 +96,9 @@ XLA_TEST_F(UtilTest, Simple3dLookup) { } XLA_TEST_F(UtilTest, SimpleSliceUpdate) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); - xla::ComputationDataHandle a, b, x, y; + xla::XlaOp a, b, x, y; auto a_data = CreateR2Parameter(AValsFull(), 0, "a", &builder, &a); auto b_data = CreateR2Parameter({{9, 1, -10}}, 1, "b", &builder, &b); auto x_data = CreateR0Parameter(2, 2, "x", &builder, &x); @@ -117,11 +116,11 @@ XLA_TEST_F(UtilTest, SimpleSliceUpdate) { } XLA_TEST_F(UtilTest, RowBatchDot) { - xla::ComputationBuilder builder(client_, TestName()); + xla::XlaBuilder builder(TestName()); int n = 4; - xla::ComputationDataHandle a, row, index; + xla::XlaOp a, row, index; auto a_data = CreateR3Parameter(BatchedAValsFull(), 0, "a", &builder, &a); auto row_data = CreateR3Parameter({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1, diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.cc b/tensorflow/compiler/tf2xla/lib/while_loop.cc index 495d9c60780b0a..09ce594930efc0 100644 --- a/tensorflow/compiler/tf2xla/lib/while_loop.cc +++ b/tensorflow/compiler/tf2xla/lib/while_loop.cc @@ -20,24 +20,24 @@ limitations under the License. namespace tensorflow { -xla::StatusOr> XlaWhileLoop( +xla::StatusOr> XlaWhileLoop( const LoopConditionFunction& condition_function, const LoopBodyFunction& body_function, - gtl::ArraySlice initial_values, - StringPiece name, xla::ComputationBuilder* builder) { + gtl::ArraySlice initial_values, StringPiece name, + xla::XlaBuilder* builder) { int arity = initial_values.size(); std::vector var_shapes; var_shapes.reserve(arity); - for (const xla::ComputationDataHandle& input : initial_values) { + for (const xla::XlaOp& input : initial_values) { TF_ASSIGN_OR_RETURN(auto shape, builder->GetShape(input)); - var_shapes.push_back(std::move(*shape)); + var_shapes.push_back(std::move(shape)); } xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(var_shapes); // Unpacks a tuple into its component parts. - auto unpack_tuple = [](xla::ComputationDataHandle tuple, int arity, - xla::ComputationBuilder* builder) { - std::vector elements(arity); + auto unpack_tuple = [](xla::XlaOp tuple, int arity, + xla::XlaBuilder* builder) { + std::vector elements(arity); for (int i = 0; i < arity; ++i) { elements[i] = builder->GetTupleElement(tuple, i); } @@ -45,20 +45,20 @@ xla::StatusOr> XlaWhileLoop( }; // Build the condition. 
- std::unique_ptr cond_builder = + std::unique_ptr cond_builder = builder->CreateSubBuilder(strings::StrCat(name, "_condition")); { auto parameter = cond_builder->Parameter(0, tuple_shape, "parameter"); - TF_ASSIGN_OR_RETURN( - auto result, + TF_RETURN_IF_ERROR( condition_function(unpack_tuple(parameter, arity, cond_builder.get()), - cond_builder.get())); + cond_builder.get()) + .status()); } TF_ASSIGN_OR_RETURN(auto cond, cond_builder->Build()); // Build the body. - std::unique_ptr body_builder = + std::unique_ptr body_builder = builder->CreateSubBuilder(strings::StrCat(name, "_body")); { auto parameter = body_builder->Parameter(0, tuple_shape, "parameter"); @@ -78,38 +78,38 @@ xla::StatusOr> XlaWhileLoop( return unpack_tuple(outputs, arity, builder); } -xla::StatusOr> XlaForEachIndex( +xla::StatusOr> XlaForEachIndex( int64 num_iterations, xla::PrimitiveType num_iterations_type, const ForEachIndexBodyFunction& body_function, - gtl::ArraySlice initial_values, - StringPiece name, xla::ComputationBuilder* builder) { - auto while_cond_fn = [&](gtl::ArraySlice values, - xla::ComputationBuilder* cond_builder) - -> xla::StatusOr { + gtl::ArraySlice initial_values, StringPiece name, + xla::XlaBuilder* builder) { + auto while_cond_fn = + [&](gtl::ArraySlice values, + xla::XlaBuilder* cond_builder) -> xla::StatusOr { return cond_builder->Lt( values[0], IntegerLiteral(cond_builder, num_iterations_type, num_iterations)); }; - auto while_body_fn = [&](gtl::ArraySlice values, - xla::ComputationBuilder* body_builder) - -> xla::StatusOr> { - xla::ComputationDataHandle iteration = values[0]; + auto while_body_fn = [&](gtl::ArraySlice values, + xla::XlaBuilder* body_builder) + -> xla::StatusOr> { + xla::XlaOp iteration = values[0]; - std::vector updated_values; + std::vector updated_values; updated_values.reserve(values.size()); updated_values.push_back(body_builder->Add( iteration, body_builder->ConstantLiteral(xla::Literal::One(num_iterations_type)))); values.remove_prefix(1); - TF_ASSIGN_OR_RETURN(std::vector body_outputs, + TF_ASSIGN_OR_RETURN(std::vector body_outputs, body_function(iteration, values, body_builder)); updated_values.insert(updated_values.end(), body_outputs.begin(), body_outputs.end()); return updated_values; }; - std::vector values; + std::vector values; values.reserve(initial_values.size() + 1); values.push_back( builder->ConstantLiteral(xla::Literal::Zero(num_iterations_type))); diff --git a/tensorflow/compiler/tf2xla/lib/while_loop.h b/tensorflow/compiler/tf2xla/lib/while_loop.h index 2e67a0c99b6deb..5b6684c995889e 100644 --- a/tensorflow/compiler/tf2xla/lib/while_loop.h +++ b/tensorflow/compiler/tf2xla/lib/while_loop.h @@ -19,8 +19,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -29,14 +29,14 @@ namespace tensorflow { // Function that builds a loop condition. Takes as input a sequence of input // values, and returns a boolean value representing if the condition succeeds. -typedef std::function( - gtl::ArraySlice, xla::ComputationBuilder*)> +typedef std::function(gtl::ArraySlice, + xla::XlaBuilder*)> LoopConditionFunction; // Function that builds a loop body. 
Takes as input a sequence of input values // and returns a sequence of output values. -typedef std::function>( - gtl::ArraySlice, xla::ComputationBuilder*)> +typedef std::function>( + gtl::ArraySlice, xla::XlaBuilder*)> LoopBodyFunction; // Helper function for building an XLA while loop, where the values carried by @@ -47,27 +47,26 @@ typedef std::function>( // init: (a, b, c) // ) // 'name' is a descriptive name for the loop. -xla::StatusOr> XlaWhileLoop( +xla::StatusOr> XlaWhileLoop( const LoopConditionFunction& condition_function, const LoopBodyFunction& body_function, - gtl::ArraySlice initial_values, - StringPiece name, xla::ComputationBuilder* builder); + gtl::ArraySlice initial_values, StringPiece name, + xla::XlaBuilder* builder); // Builds an XLA loop that repeats a computation `num_iterations` times. // // The body function (ForEachIndexBodyFunction) takes as input a pair of // (current iteration number, loop-carried values), and returns an updated // vector of the loop-carried values. -typedef std::function>( - xla::ComputationDataHandle, gtl::ArraySlice, - xla::ComputationBuilder*)> +typedef std::function>( + xla::XlaOp, gtl::ArraySlice, xla::XlaBuilder*)> ForEachIndexBodyFunction; -xla::StatusOr> XlaForEachIndex( +xla::StatusOr> XlaForEachIndex( int64 num_iterations, xla::PrimitiveType num_iterations_type, const ForEachIndexBodyFunction& body_function, - gtl::ArraySlice initial_values, - StringPiece name, xla::ComputationBuilder* builder); + gtl::ArraySlice initial_values, StringPiece name, + xla::XlaBuilder* builder); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc index 2c3cd658e04623..43e1c1e9fecec1 100644 --- a/tensorflow/compiler/tf2xla/literal_util.cc +++ b/tensorflow/compiler/tf2xla/literal_util.cc @@ -40,7 +40,7 @@ Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal) { return Status::OK(); } -Status CopyLiteralToHostTensor(const xla::Literal& literal, +Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal, Tensor* host_tensor) { TF_RET_CHECK(xla::ShapeUtil::IsArray(literal.shape()) && xla::ShapeUtil::ElementsIn(literal.shape()) == @@ -63,8 +63,8 @@ Status CopyLiteralToHostTensor(const xla::Literal& literal, return Status::OK(); } -Status LiteralToHostTensor(const xla::Literal& literal, DataType target_type, - Tensor* host_tensor) { +Status LiteralToHostTensor(const xla::LiteralSlice& literal, + DataType target_type, Tensor* host_tensor) { TensorShape shape; TF_RETURN_IF_ERROR(XLAShapeToTensorShape(literal.shape(), &shape)); *host_tensor = Tensor(target_type, shape); diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h index f283b0236811f8..220bec15538c36 100644 --- a/tensorflow/compiler/tf2xla/literal_util.h +++ b/tensorflow/compiler/tf2xla/literal_util.h @@ -36,13 +36,13 @@ Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal); // derivable from the type of , because multiple tensorflow types map // to the same XLA type (e.g. INT32 and QINT32 both map to INT32 in // XLA). -Status LiteralToHostTensor(const xla::Literal& literal, DataType target_type, - Tensor* host_tensor); +Status LiteralToHostTensor(const xla::LiteralSlice& literal, + DataType target_type, Tensor* host_tensor); // Copies the contents of 'literal' to a previously allocated tensor // 'host_tensor'. The tensor and the literal must have the same number of // elements and the same type. 
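Returning to the loop helpers declared in while_loop.h above, the sketch below shows one hypothetical way XlaForEachIndex might be called; only XlaForEachIndex, IntegerLiteral, and the XlaBuilder methods come from this change, everything else is an illustrative name.

```cpp
// Hypothetical sketch: sum the indices 0..9 into a single loop-carried S32
// value using XlaForEachIndex from while_loop.h and IntegerLiteral from
// util.h.
xla::StatusOr<xla::XlaOp> SumFirstTenIndices(xla::XlaBuilder* builder) {
  auto body = [](xla::XlaOp i, gtl::ArraySlice<xla::XlaOp> values,
                 xla::XlaBuilder* b) -> xla::StatusOr<std::vector<xla::XlaOp>> {
    // values[0] carries the running sum; add the current index to it.
    return std::vector<xla::XlaOp>{b->Add(values[0], i)};
  };
  std::vector<xla::XlaOp> init = {IntegerLiteral(builder, xla::S32, 0)};
  TF_ASSIGN_OR_RETURN(std::vector<xla::XlaOp> results,
                      XlaForEachIndex(/*num_iterations=*/10, xla::S32, body,
                                      init, "sum_first_ten", builder));
  return results[0];
}
```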
-Status CopyLiteralToHostTensor(const xla::Literal& literal, +Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal, Tensor* host_tensor); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 6051d7dffd7493..ac768b206e2a8d 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -251,7 +251,7 @@ Status CreateXlaArgs(const Graph& graph, // Converts the TensorFlow graph into an XLA computation, by executing the // graph symbolically, with each op building up the XLA HLO. Status ConvertGraphToXla(std::unique_ptr graph, xla::Client* client, - xla::Computation* computation) { + xla::XlaComputation* computation) { XlaOpRegistry::RegisterCompilationKernels(); for (Node* node : graph->nodes()) { node->set_assigned_device_name( @@ -263,8 +263,7 @@ Status ConvertGraphToXla(std::unique_ptr graph, xla::Client* client, // Compile the graph into an XLA computation. XlaCompiler::Options compiler_options; compiler_options.client = client; - DeviceType device_type(DEVICE_CPU_XLA_JIT); - compiler_options.device_type = &device_type; + compiler_options.device_type = DeviceType(DEVICE_CPU_XLA_JIT); compiler_options.flib_def = &graph->flib_def(); compiler_options.graph_def_version = graph->versions().producer(); compiler_options.allow_cpu_custom_calls = true; @@ -303,7 +302,7 @@ Status ConvertGraphToXla(std::unique_ptr graph, xla::Client* client, } // InitGraph creates a graph based on the graph_def, that may then be converted -// to an xla::Computation via ConvertGraphToXla. +// to an xla::XlaComputation via ConvertGraphToXla. // // The graph is rewritten with _Arg and _Retval nodes, representing the inputs // and outputs of the function that will be compiled. Each feed id causes a new @@ -348,7 +347,7 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, Status ConvertGraphDefToXla(const GraphDef& graph_def, const tf2xla::Config& config, xla::Client* client, - xla::Computation* computation) { + xla::XlaComputation* computation) { std::unique_ptr graph; TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph)); TF_RETURN_IF_ERROR(ConvertGraphToXla(std::move(graph), client, computation)); diff --git a/tensorflow/compiler/tf2xla/tf2xla.h b/tensorflow/compiler/tf2xla/tf2xla.h index 473c431b12d441..d02fc56c5b8f58 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.h +++ b/tensorflow/compiler/tf2xla/tf2xla.h @@ -18,21 +18,21 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/computation.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/core/framework/graph.pb.h" namespace tensorflow { -// Converts a tensorflow::GraphDef into an xla::Computation. The given `config` -// specifies the portion of the graph to convert, via feeds and fetches. Each -// feed is a positional input argument for the generated computation, while each -// fetch is a positional output argument. +// Converts a tensorflow::GraphDef into an xla::XlaComputation. The given +// `config` specifies the portion of the graph to convert, via feeds and +// fetches. Each feed is a positional input argument for the generated +// computation, while each fetch is a positional output argument. // // The computation is built in the context of the given `client`, which may // subsequently be used to compile or execute the computation. 
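A minimal calling sketch for the entry point described above, mirroring how tf2xla_test.cc (further down in this change) drives it; `graph_def` and `config` are assumed to be populated by the caller, and the wrapper name is illustrative only.

```cpp
// Minimal sketch: build an xla::XlaComputation from a GraphDef. The client
// obtained here can later be used to compile or execute the result.
Status BuildComputationFromGraphDef(const GraphDef& graph_def,
                                    const tf2xla::Config& config,
                                    xla::XlaComputation* computation) {
  xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie();
  return ConvertGraphDefToXla(graph_def, config, client, computation);
}
```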
Status ConvertGraphDefToXla(const GraphDef& graph_def, const tf2xla::Config& config, xla::Client* client, - xla::Computation* computation); + xla::XlaComputation* computation); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_test.cc b/tensorflow/compiler/tf2xla/tf2xla_test.cc index b813668a9edd3a..84c133ffabe20d 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_test.cc @@ -69,7 +69,7 @@ TEST(ConvertGraphDefToXla, Sum) { tf2xla::Config config = SumConfig(); xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); - xla::Computation computation; + xla::XlaComputation computation; TF_EXPECT_OK(ConvertGraphDefToXla(graph_def, config, client, &computation)); // Set up arguments. diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index 7ec85aa3cdec62..9203e8d9e607e9 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -232,7 +232,7 @@ Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, // Push input nodes of the currently visited node to name_queue. for (const string& in_edge : map_entry.second->input()) { auto id = ParseTensorName(in_edge); - const string node_name = id.first.ToString(); + const string node_name = std::string(id.first); if (feed_tensors.find(std::make_pair(node_name, id.second)) == feed_tensors.end()) { name_queue.push(node_name); diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index fcb0a4e63814b4..fe7ec633eca250 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/sharding_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/device_base.h" #include "tensorflow/core/platform/mem.h" @@ -108,7 +109,7 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel, // If no sharding metadata is found, XLA is free to use whatever device it // wants. In practice this usually has the effect of placing things on device // 0. - xla::ScopedShardingAssignment assign_sharding(b, op_sharding); + xla::XlaScopedShardingAssignment assign_sharding(b, op_sharding); op_kernel->Compute(context); b->ClearOpMetadata(); @@ -126,9 +127,7 @@ Status XlaCompilationDevice::MakeTensorFromProto( XlaExpression::XlaExpression() = default; -void XlaExpression::set_handle(const xla::ComputationDataHandle& h) { - handle_ = h; -} +void XlaExpression::set_handle(const xla::XlaOp& h) { handle_ = h; } void XlaExpression::set_constant_value(Tensor value) { has_constant_value_ = true; diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.h b/tensorflow/compiler/tf2xla/xla_compilation_device.h index 0243ee332fbdca..d0b9e34e162f34 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.h +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.h @@ -19,7 +19,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/tf2xla/xla_resource.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/common_runtime/local_device.h" #include "tensorflow/core/framework/device_base.h" @@ -69,7 +69,7 @@ class XlaCompilationDevice : public LocalDevice { // A XlaExpression wraps an XLA computation. Each Tensor on an // XlaCompilationDevice contains an XlaExpression, and the shape of the Tensor -// matches the shape of the subcomputation in the ComputationDataHandle. Each +// matches the shape of the subcomputation in the XlaOp. Each // expression is either a constant, or a function of previously-compiled // expressions. class XlaExpression { @@ -78,8 +78,8 @@ class XlaExpression { // handle() stores the XLA handle of the computation that the // expression represents. - void set_handle(const xla::ComputationDataHandle& h); - const xla::ComputationDataHandle& handle() const { return handle_; } + void set_handle(const xla::XlaOp& h); + const xla::XlaOp& handle() const { return handle_; } void set_constant_value(Tensor value); bool has_constant_value() const { return has_constant_value_; } @@ -90,7 +90,7 @@ class XlaExpression { private: // The XLA handle of the expression's computation. - xla::ComputationDataHandle handle_; + xla::XlaOp handle_; // If this expression is a constant with a known value, 'constant_value' is a // host-memory Tensor containing the value. Used to avoid invoking XLA for diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index c0e996768491a6..a8bd199675b4ad 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -15,10 +15,9 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include #include +#include -#include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/dump_graph.h" #include "tensorflow/compiler/tf2xla/functionalize_control_flow.h" #include "tensorflow/compiler/tf2xla/graph_compiler.h" @@ -28,7 +27,6 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_context.h" -#include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/executor.h" @@ -40,7 +38,6 @@ limitations under the License. #include "tensorflow/core/graph/node_builder.h" #include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/public/version.h" namespace tensorflow { namespace { @@ -86,12 +83,9 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options) : options_(options), initialization_status_(Status::OK()), next_step_id_(1), - device_( - new XlaCompilationDevice(SessionOptions(), *options_.device_type)), + device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)), device_mgr_({device_}) { - // We no longer need the device_type. 
- options_.device_type = nullptr; - + CHECK(!options_.device_type.type_string().empty()); if (options_.populate_resource_manager) { initialization_status_ = (*options_.populate_resource_manager)(device_->resource_manager()); @@ -110,10 +104,10 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options) local_flib_runtime_ = local_pflr_->GetFLR(device_->name()); flib_runtime_ = pflr_->GetFLR(device_->name()); - // The default variable representation shape is the identity function. - if (!options_.variable_representation_shape_fn) { - options_.variable_representation_shape_fn = - [](const TensorShape& shape, DataType type) { return shape; }; + // The default shape representation function is the identity. + if (!options_.shape_representation_fn) { + options_.shape_representation_fn = [](const TensorShape& shape, + DataType type) { return shape; }; } } @@ -230,20 +224,25 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options, // Computes the XLA shape for argument 'arg'. Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg, - xla::Shape* xla_shape) { + bool is_entry_computation, + xla::Shape* xla_shape) const { switch (arg.kind) { case XlaCompiler::Argument::kConstant: - return TensorShapeToXLAShape(arg.type, arg.constant_value.shape(), - xla_shape); - case XlaCompiler::Argument::kParameter: - return TensorShapeToXLAShape(arg.type, arg.shape, xla_shape); + LOG(FATAL) << "Unreachable case"; + case XlaCompiler::Argument::kParameter: { + TensorShape shape = + is_entry_computation + ? options_.shape_representation_fn(arg.shape, arg.type) + : arg.shape; + return TensorShapeToXLAShape(arg.type, shape, xla_shape); + } case XlaCompiler::Argument::kResource: { TF_RET_CHECK(arg.initialized); switch (arg.resource_kind) { case XlaResource::kVariable: { TensorShape representation_shape = - options_.variable_representation_shape_fn(arg.shape, arg.type); + options_.shape_representation_fn(arg.shape, arg.type); return TensorShapeToXLAShape(arg.type, representation_shape, xla_shape); } @@ -337,16 +336,25 @@ Status ExecuteGraph(XlaContext* xla_context, std::unique_ptr graph, Status BuildComputation( const std::vector& args, const std::vector& arg_cores, - const std::vector& retvals, + const std::vector& retvals, const std::vector>& resources, - bool return_updated_values_for_all_resources, - xla::ComputationBuilder* builder, xla::Computation* computation, - int* num_computation_outputs, int* num_nonconst_outputs, + bool return_updated_values_for_all_resources, xla::XlaBuilder* builder, + xla::XlaComputation* computation, int* num_computation_outputs, + int* num_nonconst_outputs, + std::vector* outputs, std::vector* resource_updates) { - std::vector elems; + std::vector elems; elems.reserve(retvals.size()); - for (const XlaExpression& retval : retvals) { - if (!retval.has_constant_value()) { + for (int i = 0; i < retvals.size(); ++i) { + XlaCompiler::OutputDescription& output = (*outputs)[i]; + output.type = retvals[i].type; + output.shape = retvals[i].shape; + const XlaExpression& retval = retvals[i].expression; + if (retval.has_constant_value()) { + output.is_constant = true; + output.constant_value = retval.constant_value(); + } else { + output.is_constant = false; elems.push_back(retval.handle()); } } @@ -376,14 +384,12 @@ Status BuildComputation( const XlaCompiler::Argument& arg = args[resource->arg_num()]; const int core = arg_cores[resource->arg_num()]; DCHECK_LT(resource->arg_num(), arg_cores.size()); - bool modified = - resource->value().handle() != 
resource->initial_value().handle(); + bool modified = resource->value() != resource->initial_value(); // TensorArray gradients were modified if their values changed or there are // any newly created gradients. for (const auto& grad : resource->tensor_array_gradients()) { modified = modified || - grad.second->value().handle() != - grad.second->initial_value().handle() || + grad.second->value() != grad.second->initial_value() || arg.tensor_array_gradients.count(grad.first) == 0; } if (return_updated_values_for_all_resources || modified) { @@ -398,11 +404,11 @@ Status BuildComputation( } // Request that the value be returned on a specific core. - xla::ScopedShardingAssignment assign_sharding( + xla::XlaScopedShardingAssignment assign_sharding( builder, core == -1 ? tensorflow::gtl::optional() : xla::sharding_builder::AssignDevice(core)); - xla::ComputationDataHandle handle; + xla::XlaOp handle; TF_RETURN_IF_ERROR(resource->Pack(&handle, builder)); // Since we can't change the sharding metadata of as this point, @@ -421,7 +427,7 @@ Status BuildComputation( builder->Tuple(elems); builder->ClearOpMetadata(); - xla::StatusOr computation_status = builder->Build(); + xla::StatusOr computation_status = builder->Build(); if (!computation_status.ok()) { return computation_status.status(); } @@ -435,7 +441,7 @@ Status BuildComputation( // `args` are the arguments to the computation. Status XlaCompiler::BuildArguments( const Graph& graph, const std::vector& args, - bool use_tuple_arg, xla::ComputationBuilder* builder, XlaContext* context, + bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, std::vector* arg_cores, std::vector* arg_expressions, std::vector* input_mapping, std::vector* input_shapes, bool is_entry_computation) { @@ -461,8 +467,7 @@ Status XlaCompiler::BuildArguments( // alias. XlaResource* resource; TF_RETURN_IF_ERROR(context->CreateResource( - arg.resource_kind, i, arg.name, arg.type, arg.shape, - xla::ComputationDataHandle(), + arg.resource_kind, i, arg.name, arg.type, arg.shape, xla::XlaOp(), /*tensor_array_size=*/arg.tensor_array_size, /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource)); arg_expression.set_resource(resource); @@ -493,8 +498,8 @@ Status XlaCompiler::BuildArguments( std::vector arg_shapes(input_mapping->size()); for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { // Computes the shapes of non-constant arguments. - TF_RETURN_IF_ERROR( - XLAShapeForArgument(args[(*input_mapping)[i]], &arg_shapes[i])); + TF_RETURN_IF_ERROR(XLAShapeForArgument( + args[(*input_mapping)[i]], is_entry_computation, &arg_shapes[i])); } if (use_tuple_arg) { @@ -531,9 +536,9 @@ Status XlaCompiler::BuildArguments( builder->SetOpMetadata(arg_metadata); // Build parameter handles for non-constant arguments. - std::vector arg_handles(input_mapping->size()); + std::vector arg_handles(input_mapping->size()); if (use_tuple_arg) { - xla::ComputationDataHandle tuple; + xla::XlaOp tuple; if (is_entry_computation) { xla::OpSharding tuple_sharding; tuple_sharding.set_type(xla::OpSharding::Type::OpSharding_Type_TUPLE); @@ -544,15 +549,15 @@ Status XlaCompiler::BuildArguments( core == -1 ? 
xla::sharding_builder::AssignDevice(root_device) : xla::sharding_builder::AssignDevice(core); } - xla::ScopedShardingAssignment assign_tuple_sharding(builder, - tuple_sharding); + xla::XlaScopedShardingAssignment assign_tuple_sharding(builder, + tuple_sharding); tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple"); } else { tuple = builder->Parameter(0, (*input_shapes)[0], "arg_tuple"); } for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { const int core = (*arg_cores)[input_mapping->at(i)]; - xla::ScopedShardingAssignment assign_sharding( + xla::XlaScopedShardingAssignment assign_sharding( builder, core == -1 ? tensorflow::gtl::optional() : xla::sharding_builder::AssignDevice(core)); arg_handles[i] = builder->GetTupleElement(tuple, i); @@ -560,7 +565,7 @@ Status XlaCompiler::BuildArguments( } else { for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { const int core = (*arg_cores)[input_mapping->at(i)]; - xla::ScopedShardingAssignment assign_sharding( + xla::XlaScopedShardingAssignment assign_sharding( builder, core == -1 ? tensorflow::gtl::optional() : xla::sharding_builder::AssignDevice(core)); arg_handles[i] = @@ -570,7 +575,8 @@ Status XlaCompiler::BuildArguments( builder->ClearOpMetadata(); - // Fill in the handles in non-constant arguments. + // Fill in the handles in non-constant arguments, and reshape parameters + // back to their correct shapes. VLOG(2) << "XLA computation inputs:"; for (std::vector::size_type i = 0; i < input_mapping->size(); ++i) { const XlaCompiler::Argument& arg = args[input_mapping->at(i)]; @@ -589,7 +595,15 @@ Status XlaCompiler::BuildArguments( break; } case XlaCompiler::Argument::kParameter: - arg_expression.set_handle(arg_handles[i]); + // Reshape parameters back to their correct shapes. + // TODO(b/76097077): propagate device assignments onto arguments and + // return values of functions, and then reshape unconditionally. + if (is_entry_computation) { + arg_expression.set_handle( + builder->Reshape(arg_handles[i], arg.shape.dim_sizes())); + } else { + arg_expression.set_handle(arg_handles[i]); + } break; case XlaCompiler::Argument::kConstant: case XlaCompiler::Argument::kInvalid: @@ -642,12 +656,71 @@ Status XlaCompiler::CompileSingleOp( return CompileGraph(options, name, std::move(graph), args, result); } +namespace { + +// Check that the ops of all non-functional nodes have been registered. +string ValidateFunctionDef(const FunctionDef* fdef, + const FunctionLibraryDefinition& flib_def) { + std::vector invalid_ops; + for (const NodeDef& node : fdef->node_def()) { + const string& op = node.op(); + if (op == FunctionLibraryDefinition::kGradientOp || flib_def.Find(op)) { + continue; + } + const OpDef* op_def; + if (!OpRegistry::Global()->LookUpOpDef(op, &op_def).ok()) { + invalid_ops.push_back(op); + } + } + return tensorflow::str_util::Join(invalid_ops, ", "); +} + +// Check that the graph doesn't have any invalid nodes (e.g. incompatible with +// given device_type, invalid data type, missing attributes...) 
+Status ValidateGraph(const Graph* graph, + const FunctionLibraryDefinition& flib_def, + const DeviceType& device_type, const string& name) { + std::vector invalid_ops; + for (const Node* node : graph->nodes()) { + if (node->type_string() == FunctionLibraryDefinition::kGradientOp) { + continue; + } + const FunctionDef* fdef = flib_def.Find(node->def().op()); + if (fdef) { + string error_msg = ValidateFunctionDef(fdef, flib_def); + if (!error_msg.empty()) { + invalid_ops.push_back( + strings::StrCat(node->def().op(), ":{", error_msg, "}")); + } + continue; + } + const OpDef* op_def; + if (!OpRegistry::Global()->LookUpOpDef(node->def().op(), &op_def).ok()) { + invalid_ops.push_back(node->def().op()); + continue; + } + TF_RETURN_IF_ERROR(ValidateNodeDef(node->def(), *op_def)); + if (!FindKernelDef(device_type, node->def(), nullptr, nullptr).ok()) { + invalid_ops.push_back(node->def().op()); + } + } + if (!invalid_ops.empty()) { + return errors::InvalidArgument(strings::StrCat( + "Detected unsupported operations when trying to compile graph ", name, + " on ", device_type.type_string(), ":", + tensorflow::str_util::Join(invalid_ops, ", "))); + } + return Status::OK(); +} + +} // namespace + Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, string const& name, std::unique_ptr graph, const std::vector& args, CompilationResult* result) { - VLOG(1) << "Executing graph symbolically to populate ComputationBuilder."; + VLOG(1) << "Executing graph symbolically to populate XlaBuilder."; if (VLOG_IS_ON(2)) { VLOG(2) << "XlaCompiler::CompileGraph: " @@ -661,13 +734,19 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, // Converts Tensorflow's graph control-flow constructs into functional // control-flow that can be compiled into XLA code. TF_RETURN_IF_ERROR( - FunctionalizeControlFlow(graph.get(), local_flib_def_.get())); - - xla::ComputationBuilder builder(client(), name); - XlaContext* context = - new XlaContext(this, &builder, options_.allow_cpu_custom_calls, - options.resolve_compile_time_constants, - &options_.variable_representation_shape_fn); + FunctionalizeControlFlow(flib_runtime_->GetFunctionLibraryDefinition(), + graph.get(), local_flib_def_.get())); + + // Detect invalid nodes. + // FunctionalizeControlFlow may remove some nodes from the graph. 
+ TF_RETURN_IF_ERROR(ValidateGraph(graph.get(), *options_.flib_def, + options_.device_type, name)); + + xla::XlaBuilder builder(name); + XlaContext* context = new XlaContext( + this, &builder, options_.allow_cpu_custom_calls, + options.resolve_compile_time_constants, options.is_entry_computation, + &options_.shape_representation_fn); core::ScopedUnref context_unref(context); std::vector arg_expressions; @@ -683,36 +762,23 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, int num_nonconst_outputs; int num_computation_outputs; - result->computation = std::make_shared(); + result->computation = std::make_shared(); + result->outputs.resize(context->retvals().size()); TF_RETURN_IF_ERROR(BuildComputation( args, arg_cores, context->retvals(), context->resources(), options.return_updated_values_for_all_resources, &builder, result->computation.get(), &num_computation_outputs, - &num_nonconst_outputs, &result->resource_updates)); + &num_nonconst_outputs, &result->outputs, &result->resource_updates)); VLOG(2) << "Outputs: total: " << context->retvals().size() << " nonconstant: " << num_nonconst_outputs; - result->outputs.resize(context->retvals().size()); - for (std::vector::size_type i = 0; - i < context->retvals().size(); ++i) { - const XlaExpression& retval = context->retvals()[i]; - if (retval.has_constant_value()) { - OutputDescription& output = result->outputs[i]; - output.shape = retval.constant_value().shape(); - output.is_constant = true; - output.constant_value = retval.constant_value(); - } - } - // Compute the output shapes, if there is a computation with non-constant + // Compute the XLA output shape, if there is a computation with non-constant // outputs. - auto computation_shape = client()->GetComputationShape(*result->computation); - if (!computation_shape.ok()) { - return computation_shape.status(); - } + TF_ASSIGN_OR_RETURN(std::unique_ptr computation_shape, + client()->GetComputationShape(*result->computation)); - result->xla_output_shape.Swap( - computation_shape.ValueOrDie()->mutable_result()); + result->xla_output_shape.Swap(computation_shape->mutable_result()); VLOG(2) << "XLA output shape: " << xla::ShapeUtil::HumanString(result->xla_output_shape); @@ -727,23 +793,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, // Tensorflow expects a major-to-minor order of results. xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape); - // Converts the output shapes to TensorShapes. 
- int computation_output = 0; - for (std::vector::size_type i = 0; - i < context->retvals().size(); ++i) { - const XlaExpression& retval = context->retvals()[i]; - if (!retval.has_constant_value()) { - TF_RET_CHECK(computation_output < num_computation_outputs) - << "Computation has more outputs than expected"; - OutputDescription& output = result->outputs[i]; - output.is_constant = false; - TF_RETURN_IF_ERROR(XLAShapeToTensorShape( - xla::ShapeUtil::GetTupleElementShape(result->xla_output_shape, - computation_output), - &output.shape)); - ++computation_output; - } - } return Status::OK(); } @@ -814,7 +863,7 @@ Status XlaCompiler::SetHostToDeviceMetadata( } Status XlaCompiler::GetHostComputeControlDependency( - const string& host_compute_name, xla::ComputationDataHandle* handle) { + const string& host_compute_name, xla::XlaOp* handle) { const auto iter = host_compute_control_output_.find(host_compute_name); if (iter == host_compute_control_output_.end()) { return errors::InvalidArgument( @@ -827,7 +876,7 @@ Status XlaCompiler::GetHostComputeControlDependency( } Status XlaCompiler::SetHostComputeControlDependency( - const string& host_compute_name, const xla::ComputationDataHandle& handle) { + const string& host_compute_name, const xla::XlaOp& handle) { if (host_compute_control_output_.find(host_compute_name) != host_compute_control_output_.end()) { return errors::InvalidArgument( diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h index 8f564f35ec8176..c93850ce270502 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.h +++ b/tensorflow/compiler/tf2xla/xla_compiler.h @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/core/common_runtime/device.h" #include "tensorflow/core/common_runtime/device_mgr.h" @@ -38,7 +39,7 @@ class XlaContext; // It does a symbolic execution of the graph starting from specific input // shapes, using a JIT device to convert operators into XLA computations. // -// XlaCompiler is typically invoked from an `_XlaLaunch` operator once the +// XlaCompiler is typically invoked from an `XlaLaunch` operator once the // shapes of all input parameters to the computation are known. This is // because the symbolic execution requires known shapes for all operations. // @@ -67,6 +68,15 @@ class XlaContext; // _Retval values are ordered by _Retval index, whereas kResource values are // ordered by the original _Arg position of the variable. // +// If a shape representation function is provided as part of +// XlaCompiler::CompileOptions, kParameter arguments and return values to an +// entry computation will be reshaped in accordance to the shape function. +// Arguments and return values to a non-entry computation are not reshaped. +// Variable resource arguments are passed and returned in reshaped form, even +// for non-entry computations. This feature allows TensorFlow to keep on-device +// tensors with a different shape to their representation inside the XLA +// computation. +// // In both inputs and outputs, kResource values are placed the end. When // emitting While loop bodies, we must ensure that the loop body has // identical input and output signatures. 
By moving variable values @@ -171,7 +181,7 @@ class XlaCompiler { }; struct OutputDescription { - // Type and shape of the output. + // Type and shape of the output. The shape is the unflattened shape. DataType type; TensorShape shape; @@ -206,10 +216,12 @@ class XlaCompiler { // original arguments, and are not necessarily in the same order.) std::vector input_mapping; - // Input shapes of the computation. + // Input shapes of the computation. If we are flattening inputs, these are + // the flattened shapes. std::vector xla_input_shapes; - // Output shape in XLA format. The output shape is always a tuple. + // Output shape in XLA format. The output shape is always a tuple. If we + // are flattening outputs, these are the flattened shapes. xla::Shape xla_output_shape; // TensorFlow shapes of outputs, together with the values of any @@ -227,13 +239,15 @@ class XlaCompiler { std::vector resource_updates; // The XLA computation built from the tensorflow subgraph. - std::shared_ptr computation; + std::shared_ptr computation; }; + typedef std::function + ShapeRepresentationFn; struct Options { - // Name of the compilation device to use. Needs to be live only during - // XlaCompiler's constructor. - const DeviceType* device_type = nullptr; + // Name of the compilation device to use. It must be set by the caller. + // The default empty value is invalid. + DeviceType device_type = DeviceType(""); xla::Client* client = nullptr; @@ -250,8 +264,7 @@ class XlaCompiler { // If set, the XLA representation of variables represented to XLA as the // shape given by this shape function. Variables are reshaped to this shape // on write, and reshaped to their original shape on read. - std::function - variable_representation_shape_fn; + ShapeRepresentationFn shape_representation_fn; // If not nullptr, populate_resource_manager is called with the // compilation device's resource manager when the compilation @@ -281,7 +294,7 @@ class XlaCompiler { const NameAttrList& fn_name_attrs, std::vector args, CompilationResult* result); - // Compiles a tensorflow::Graph into an xla::Computation. + // Compiles a tensorflow::Graph into an xla::XlaComputation. // Similar to CompileFunction, but takes a Graph as input rather than a // function. Status CompileGraph(const CompileOptions& options, string const& name, @@ -290,7 +303,7 @@ class XlaCompiler { CompilationResult* result); // Compiles a single Op, given by an OpKernelContext, into an - // xla::Computation. Similar to CompileFunction but takes a single Op as + // xla::XlaComputation. Similar to CompileFunction but takes a single Op as // input. Status CompileSingleOp(const CompileOptions& options, string const& name, OpKernelContext* ctx, @@ -300,7 +313,8 @@ class XlaCompiler { // Returns the shape of the XLA parameter for an argument 'arg'. // See the class comment for more details about the argument passing // convention. - Status XLAShapeForArgument(const Argument& arg, xla::Shape* xla_shape); + Status XLAShapeForArgument(const Argument& arg, bool is_entry_computation, + xla::Shape* xla_shape) const; // Retrieves the channel handle associated with `key`. Allocates // a new channel handle if none exists. @@ -337,10 +351,9 @@ class XlaCompiler { // a given HostCompute Op as long as the names are unique within the // compilation. 
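Looping back to the ShapeRepresentationFn added to XlaCompiler::Options above, the sketch below shows a hypothetical way a caller might install one; the identity default in xla_compiler.cc simply returns the incoming shape, and the rounding rule here is invented purely for illustration.

```cpp
// Hypothetical example of a shape representation function: keep the identity
// behavior for everything except float tensors, whose minor-most dimension is
// rounded up to a multiple of 8 for the on-device layout. The rounding rule
// is invented for illustration only.
XlaCompiler::Options MakePaddedFloatOptions(xla::Client* client) {
  XlaCompiler::Options options;
  options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
  options.client = client;
  options.shape_representation_fn = [](const TensorShape& shape,
                                       DataType type) {
    TensorShape on_device_shape = shape;
    if (type == DT_FLOAT && shape.dims() > 0) {
      const int64 last = shape.dim_size(shape.dims() - 1);
      on_device_shape.set_dim(shape.dims() - 1, (last + 7) / 8 * 8);
    }
    return on_device_shape;
  };
  return options;
}
```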
Status GetHostComputeControlDependency(const string& host_compute_name, - xla::ComputationDataHandle* handle); - Status SetHostComputeControlDependency( - const string& host_compute_name, - const xla::ComputationDataHandle& handle); + xla::XlaOp* handle); + Status SetHostComputeControlDependency(const string& host_compute_name, + const xla::XlaOp& handle); const Options& options() const { return options_; } xla::Client* client() const { return options_.client; } @@ -358,7 +371,7 @@ class XlaCompiler { // `args` are the arguments to the computation. Status BuildArguments(const Graph& graph, const std::vector& args, - bool use_tuple_arg, xla::ComputationBuilder* builder, + bool use_tuple_arg, xla::XlaBuilder* builder, XlaContext* context, std::vector* arg_cores, std::vector* arg_expressions, std::vector* input_mapping, @@ -408,8 +421,7 @@ class XlaCompiler { std::unordered_map host_compute_sends_; std::unordered_map host_compute_recvs_; - std::unordered_map - host_compute_control_output_; + std::unordered_map host_compute_control_output_; TF_DISALLOW_COPY_AND_ASSIGN(XlaCompiler); }; diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc index 096dc7160bfc0a..5fbf4b952c6e6f 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc @@ -25,12 +25,14 @@ limitations under the License. #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/common_shape_fns.h" #include "tensorflow/core/framework/function.h" #include "tensorflow/core/framework/function_testlib.h" #include "tensorflow/core/framework/resource_mgr.h" +#include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor_testutil.h" #include "tensorflow/core/graph/graph.h" #include "tensorflow/core/graph/graph_constructor.h" @@ -43,8 +45,6 @@ namespace tensorflow { class XlaCompilerTest : public ::testing::Test { protected: - XlaCompilerTest() : cpu_device_type_(DEVICE_CPU_XLA_JIT) {} - void SetUp() override { client_ = xla::ClientLibrary::LocalClientOrDie(); @@ -56,7 +56,7 @@ class XlaCompilerTest : public ::testing::Test { XlaCompiler::Options DefaultOptions() { XlaCompiler::Options options; - options.device_type = &cpu_device_type_; + options.device_type = DeviceType(DEVICE_CPU_XLA_JIT); options.client = client_; options.flib_def = flib_def_.get(); return options; @@ -66,7 +66,6 @@ class XlaCompilerTest : public ::testing::Test { return compiler->local_flib_def_.get(); } - DeviceType cpu_device_type_; xla::Client* client_; std::unique_ptr flib_def_; }; @@ -164,7 +163,6 @@ REGISTER_XLA_OP(Name("DummyDuplicateOp").Device(DEVICE_CPU_XLA_JIT), REGISTER_XLA_OP(Name("DummyDuplicateOp").Device(DEVICE_GPU_XLA_JIT), DummyDuplicateOp); - // Tests compilation and execution of an empty graph. 
TEST_F(XlaCompilerTest, EmptyReturnValues) { XlaCompiler compiler(DefaultOptions()); @@ -226,7 +224,7 @@ TEST_F(XlaCompilerTest, Simple) { xla::Literal::CreateR1({4, 143}); std::unique_ptr expected_literal = xla::Literal::MakeTuple({expected0.get()}); - xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } TEST_F(XlaCompilerTest, HasSaneErrorOnNonCompileTimeConstantInputToReshape) { @@ -321,7 +319,8 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { xla::Literal::CreateR1({-7, -42}); std::unique_ptr expected_literal = xla::Literal::MakeTuple({expected0.get()}); - xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + EXPECT_TRUE( + xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } { @@ -356,10 +355,80 @@ TEST_F(XlaCompilerTest, ConstantOutputs) { xla::Literal::CreateR1({-7, -42}); std::unique_ptr expected = xla::Literal::MakeTuple({expected0.get(), expected1.get()}); - xla::LiteralTestUtil::ExpectEqual(*expected, *actual_literal); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected, *actual_literal)); } } +TEST_F(XlaCompilerTest, ConstantOutputsOfFunctionalNode) { + // Define a function with one compile-time constant output and one + // data-dependent output. + // @function.Defun(noinline=True) + // foo(a) {b=7; return b, a; } + const Tensor seven = test::AsScalar(7); + FunctionDef fdef = FunctionDefHelper::Create( + "foo", {"a_0:int32"}, {"const:int32", "a:int32"}, {}, + { + {{"Const"}, "Const", {}, {{"dtype", DT_INT32}, {"value", seven}}}, + }, + {{"a", "a_0"}, {"const", "Const:output:0"}}); + (*fdef.mutable_attr())["_noinline"].set_b(true); + FunctionDefLibrary fdef_lib; + *(fdef_lib.add_function()) = fdef; + std::unique_ptr graph(new Graph(OpRegistry::Global())); + { + Scope scope = Scope::NewRootScope().ExitOnError(); + TF_EXPECT_OK(scope.graph()->AddFunctionLibrary(fdef_lib)); + auto arg = ops::_Arg(scope.WithOpName("input_arg"), DT_INT32, 0); + NodeDef foo; + foo.set_name("foo"); + foo.set_op("foo"); + *foo.add_input() = "input_arg"; + Status status; + scope.graph()->AddNode(foo, &status); + TF_ASSERT_OK(status); + NodeDef retval_1; + retval_1.set_name("retval_0"); + retval_1.set_op(FunctionLibraryDefinition::kRetOp); + *retval_1.add_input() = "foo"; + (*retval_1.mutable_attr())["T"].set_type(DT_INT32); + (*retval_1.mutable_attr())["index"].set_i(0); + scope.graph()->AddNode(retval_1, &status); + TF_ASSERT_OK(status); + NodeDef retval_2; + retval_2.set_name("retval_1"); + retval_2.set_op(FunctionLibraryDefinition::kRetOp); + *retval_2.add_input() = "foo:1"; + (*retval_2.mutable_attr())["T"].set_type(DT_INT32); + (*retval_2.mutable_attr())["index"].set_i(1); + scope.graph()->AddNode(retval_2, &status); + TF_ASSERT_OK(status); + TF_ASSERT_OK(scope.ToGraph(graph.get())); + } + + // Builds a description of the arguments. 
+ std::vector args(1); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({1}); + + XlaCompiler::Options options = DefaultOptions(); + FunctionLibraryDefinition flib_def(OpRegistry::Global(), fdef_lib); + options.flib_def = &flib_def; + XlaCompiler compiler(options); + + XlaCompiler::CompileOptions compile_options; + compile_options.resolve_compile_time_constants = true; + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "constants", + std::move(graph), args, &result)); + + ASSERT_EQ(2, result.outputs.size()); + EXPECT_TRUE(result.outputs[0].is_constant); + test::ExpectTensorEqual(result.outputs[0].constant_value, + test::AsScalar(7)); + EXPECT_FALSE(result.outputs[1].is_constant); +} + // Tests compilation and execution of a graph that adds two tensors. TEST_F(XlaCompilerTest, ResourceManager) { // Builds a graph that calls the dummy resource Op. @@ -433,21 +502,26 @@ TEST_F(XlaCompilerTest, DeterministicCompilation) { } for (int64 i = 1; i < test_count; ++i) { - auto m1 = - results[i - 1].computation->Snapshot().ValueOrDie()->entry().requests(); - auto m2 = - results[i].computation->Snapshot().ValueOrDie()->entry().requests(); - // Check if every entry is the same. - for (auto& entry1 : m1) { - int64 key = entry1.first; - auto value1 = entry1.second; - auto entry2 = m2.find(key); - auto value2 = entry2->second; - EXPECT_TRUE(entry2 != m2.end()); - string str1, str2; - value1.AppendToString(&str1); - value2.AppendToString(&str2); - EXPECT_EQ(str1, str2); + const auto& m1 = results[i - 1].computation->proto(); + const auto& m2 = results[i].computation->proto(); + ASSERT_EQ(m1.computations_size(), m2.computations_size()); + // Check if every hlo computation is the same. + for (int k = 0; k < m1.computations_size(); k++) { + const auto& c1 = m1.computations(k); + const auto& c2 = m2.computations(k); + ASSERT_EQ(c1.instructions_size(), c2.instructions_size()); + for (int j = 0; j < c1.instructions_size(); j++) { + auto instr1 = c1.instructions(j); + auto instr2 = c2.instructions(j); + instr1.clear_name(); + instr2.clear_name(); + // The names of instructions were uniquified by the XlaBuilder, the rest + // of the fields should be identical. + string str1, str2; + instr1.AppendPartialToString(&str1); + instr2.AppendPartialToString(&str2); + EXPECT_EQ(str1, str2); + } } } } @@ -519,7 +593,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) { {output_base.get(), output_grad1.get(), output_grad2.get()}); std::unique_ptr expected_literal = xla::Literal::MakeTuple({output_read.get(), output_resource.get()}); - xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } // Tests compilation and execution of a graph that adds two tensors. @@ -742,13 +816,10 @@ TEST_F(XlaCompilerTest, Variables) { xla::Literal::CreateR1({4, 143}); std::unique_ptr expected_literal = xla::Literal::MakeTuple({expected0.get(), expected1.get()}); - xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } -// Tests a simple graph that reads and writes a variable, with a -// variable_representation_shape_fn passed to the compiler that flattens all -// variable tensors to vectors. 
-TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { +xla::StatusOr> BuildTestGraph() { Scope scope = Scope::NewRootScope().ExitOnError(); auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0); auto var = ops::_Arg(scope.WithOpName("V"), DT_RESOURCE, 1); @@ -759,7 +830,15 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { auto read_plus_one = ops::Add(scope, read, ops::Const(scope, 1)); auto d = ops::_Retval(scope.WithOpName("D"), read_plus_one, 0); std::unique_ptr graph(new Graph(OpRegistry::Global())); - TF_ASSERT_OK(scope.ToGraph(graph.get())); + TF_RETURN_IF_ERROR(scope.ToGraph(graph.get())); + return std::move(graph); +} + +// Tests a simple graph that reads and writes a variable, with a +// shape_representation_fn passed to the compiler that flattens all +// variable tensors to vectors. +TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr graph, BuildTestGraph()); // Builds a description of the arguments. std::vector args(2); @@ -774,15 +853,33 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { // Compiles the graph. XlaCompiler::Options options = DefaultOptions(); - options.variable_representation_shape_fn = [](const TensorShape& shape, - DataType type) { + options.shape_representation_fn = [](const TensorShape& shape, + DataType type) { return TensorShape({shape.num_elements()}); }; XlaCompiler compiler(options); + XlaCompiler::CompileOptions compile_options; + compile_options.is_entry_computation = false; // Only reshape variables. + XlaCompiler::CompilationResult result; - TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "add", - std::move(graph), args, &result)); + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph), + args, &result)); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr program_shape, + client_->GetComputationShape(*result.computation)); + + ASSERT_EQ(program_shape->parameters_size(), 2); + EXPECT_TRUE( + xla::ShapeUtil::Compatible(program_shape->parameters(0), + xla::ShapeUtil::MakeShape(xla::S32, {2, 2}))); + EXPECT_TRUE(xla::ShapeUtil::Compatible( + program_shape->parameters(1), xla::ShapeUtil::MakeShape(xla::S32, {4}))); + EXPECT_TRUE(xla::ShapeUtil::Compatible( + program_shape->result(), + xla::ShapeUtil::MakeTupleShape( + {xla::ShapeUtil::MakeShape(xla::S32, {2, 2}), + xla::ShapeUtil::MakeShape(xla::S32, {4})}))); // Tests that the generated computation works. std::unique_ptr param0_literal = @@ -807,7 +904,149 @@ TEST_F(XlaCompilerTest, VariableRepresentationShapeFunction) { xla::Literal::CreateR1({26, 66, 34, 401}); std::unique_ptr expected_literal = xla::Literal::MakeTuple({expected0.get(), expected1.get()}); - xla::LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); +} + +TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) { + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr graph, BuildTestGraph()); + + // Builds a description of the arguments. + std::vector args(2); + args[0].kind = XlaCompiler::Argument::kParameter; + args[0].type = DT_INT32; + args[0].shape = TensorShape({2, 2}); + args[1].kind = XlaCompiler::Argument::kResource; + args[1].resource_kind = XlaResource::kVariable; + args[1].initialized = true; + args[1].type = DT_INT32; + args[1].shape = TensorShape({2, 2}); + + // Compiles the graph. 
+ XlaCompiler::Options options = DefaultOptions(); + options.shape_representation_fn = [](const TensorShape& shape, + DataType type) { + return TensorShape({shape.num_elements()}); + }; + XlaCompiler compiler(options); + + XlaCompiler::CompileOptions compile_options; + compile_options.is_entry_computation = true; // Reshape args and retvals. + + XlaCompiler::CompilationResult result; + TF_ASSERT_OK(compiler.CompileGraph(compile_options, "add", std::move(graph), + args, &result)); + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr program_shape, + client_->GetComputationShape(*result.computation)); + + ASSERT_EQ(program_shape->parameters_size(), 2); + EXPECT_TRUE(xla::ShapeUtil::Compatible( + program_shape->parameters(0), xla::ShapeUtil::MakeShape(xla::S32, {4}))); + EXPECT_TRUE(xla::ShapeUtil::Compatible( + program_shape->parameters(1), xla::ShapeUtil::MakeShape(xla::S32, {4}))); + EXPECT_TRUE(xla::ShapeUtil::Compatible( + program_shape->result(), + xla::ShapeUtil::MakeTupleShape( + {xla::ShapeUtil::MakeShape(xla::S32, {4}), + xla::ShapeUtil::MakeShape(xla::S32, {4})}))); + + // Tests that the generated computation works. + std::unique_ptr param0_literal = + xla::Literal::CreateR1({4, 55, 1, -3}); + std::unique_ptr param1_literal = + xla::Literal::CreateR1({22, 11, 33, 404}); + std::unique_ptr param0_data = + client_->TransferToServer(*param0_literal).ConsumeValueOrDie(); + std::unique_ptr param1_data = + client_->TransferToServer(*param1_literal).ConsumeValueOrDie(); + + std::unique_ptr actual = + client_ + ->Execute(*result.computation, {param0_data.get(), param1_data.get()}) + .ConsumeValueOrDie(); + std::unique_ptr actual_literal = + client_->Transfer(*actual).ConsumeValueOrDie(); + + std::unique_ptr expected0 = + xla::Literal::CreateR1({27, 67, 35, 402}); + std::unique_ptr expected1 = + xla::Literal::CreateR1({26, 66, 34, 401}); + std::unique_ptr expected_literal = + xla::Literal::MakeTuple({expected0.get(), expected1.get()}); + EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal)); +} + +// Tests a graph which has a function with an invalid op. 
+TEST_F(XlaCompilerTest, FunctionWithInvalidOp) { + XlaCompiler compiler(DefaultOptions()); + + FunctionDefLibrary flib; + FunctionDef fn = FillFn(); + NodeDef* node = fn.add_node_def(); + node->set_name("Invalid"); + node->set_op("InvalidOp"); /* unsupported op */ + node = fn.add_node_def(); + node->set_name("Switch"); + node->set_op("Switch"); /* control flow node */ + *flib.add_function() = fn; + + TF_ASSERT_OK(flib_def_->AddFunctionDef(fn)); + + std::unique_ptr graph(new Graph(OpRegistry::Global())); + + Scope scope = Scope::NewRootScope().ExitOnError(); + auto value = ops::Const(scope.WithOpName("value"), 1, {}); + auto shape = ops::Const(scope.WithOpName("shape"), {5}, {1}); + TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(flib)); + + NodeDef def; + TF_ASSERT_OK(NodeDefBuilder("fill_fn", "FillFn", flib_def_.get()) + .Input(value.name(), 0, DT_INT32) + .Input(shape.name(), 1, DT_INT32) + .Finalize(&def)); + Status status; + Node* fill = scope.graph()->AddNode(def, &status); + TF_ASSERT_OK(status); + TF_ASSERT_OK(scope.DoShapeInference(fill)); + scope.graph()->AddEdge(value.node(), 0, fill, 0); + scope.graph()->AddEdge(shape.node(), 0, fill, 1); + + auto retval = ops::_Retval(scope.WithOpName("retval"), Output(fill), 0); + + TF_ASSERT_OK(scope.ToGraph(graph.get())); + + std::vector args; + XlaCompiler::CompilationResult result; + status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill", + std::move(graph), args, &result); + ASSERT_FALSE(status.ok()); + EXPECT_TRUE( + str_util::StrContains(status.error_message(), "FillFn:{InvalidOp}")) + << status.error_message(); +} + +// Tests a graph which has a node with invalid data type. +TEST_F(XlaCompilerTest, NodeWithInvalidDataType) { + std::unique_ptr graph(new Graph(OpRegistry::Global())); + NodeDef shape; + shape.set_name("Shape"); + shape.set_op("Shape"); + (*shape.mutable_attr())["T"].set_type(DT_INT32); + (*shape.mutable_attr())["out_type"].set_type(DT_BOOL); /* invalid type */ + Status status; + Node* shape_node = graph->AddNode(shape, &status); + TF_ASSERT_OK(status); + graph->AddControlEdge(graph->source_node(), shape_node); + + std::vector args; + XlaCompiler::CompilationResult result; + XlaCompiler compiler(DefaultOptions()); + status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "invalid_type", + std::move(graph), args, &result); + ASSERT_FALSE(status.ok()); + EXPECT_TRUE(str_util::StrContains(status.error_message(), + "is not in the list of allowed values")) + << status.error_message(); } } // namespace diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index 8423921086fec1..098072d33cd4eb 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -25,7 +25,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -63,28 +63,32 @@ void XlaContext::set_args(std::vector args) { } XlaContext::XlaContext( - XlaCompiler* compiler, xla::ComputationBuilder* builder, + XlaCompiler* compiler, xla::XlaBuilder* builder, bool allow_cpu_custom_calls, bool resolve_compile_time_constants, + bool is_entry_computation, const std::function* - variable_representation_shape_fn) + shape_representation_fn) : compiler_(compiler), builder_(builder), allow_cpu_custom_calls_(allow_cpu_custom_calls), resolve_compile_time_constants_(resolve_compile_time_constants), - variable_representation_shape_fn_(variable_representation_shape_fn) {} + is_entry_computation_(is_entry_computation), + shape_representation_fn_(shape_representation_fn) {} string XlaContext::DebugString() { return "TLA JIT context"; } // This is called by the Retval Op to associate a computed value // with a specific return value of the subgraph. void XlaContext::AddRetval(int retval_index, DataType type, - const xla::ComputationDataHandle& handle) { + const TensorShape& shape, const xla::XlaOp& handle) { VLOG(1) << "Added retval index " << retval_index << " to XLA computation"; // Add the return value to the list being built up. if (retvals_.size() <= retval_index) { retvals_.resize(retval_index + 1); } - retvals_[retval_index].set_handle(handle); + XlaExpression e; + e.set_handle(handle); + retvals_[retval_index] = Retval{type, shape, e}; } Status XlaContext::AddConstRetval(int retval_index, DataType dtype, @@ -94,23 +98,20 @@ Status XlaContext::AddConstRetval(int retval_index, DataType dtype, if (retvals_.size() <= retval_index) { retvals_.resize(retval_index + 1); } - if (resolve_compile_time_constants_) { - Tensor value; - TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value)); - retvals_[retval_index].set_constant_value(std::move(value)); - } else { - retvals_[retval_index].set_handle(builder_->ConstantLiteral(literal)); - } + Tensor value; + TF_RETURN_IF_ERROR(LiteralToHostTensor(literal, dtype, &value)); + XlaExpression e; + e.set_constant_value(value); + retvals_[retval_index] = Retval{dtype, value.shape(), e}; return Status::OK(); } -xla::ComputationBuilder* XlaContext::builder() { return builder_; } +xla::XlaBuilder* XlaContext::builder() { return builder_; } Status XlaContext::CreateResource( XlaResource::Kind kind, int arg_num, string name, DataType type, - TensorShape shape, const xla::ComputationDataHandle& handle, - int64 tensor_array_size, const std::set& tensor_array_gradients, - XlaResource** resource) { + TensorShape shape, const xla::XlaOp& handle, int64 tensor_array_size, + const std::set& tensor_array_gradients, XlaResource** resource) { resources_.emplace_back( new XlaResource(kind, arg_num, std::move(name), type, std::move(shape), handle, tensor_array_size, tensor_array_gradients)); @@ -118,16 +119,16 @@ Status XlaContext::CreateResource( return Status::OK(); } -TensorShape XlaContext::VariableRepresentationShape(const TensorShape& shape, - DataType type) const { - return (*variable_representation_shape_fn_)(shape, type); +TensorShape XlaContext::RepresentationShape(const TensorShape& shape, 
+ DataType type) const { + return (*shape_representation_fn_)(shape, type); } -const xla::Computation* XlaContext::GetOrCreateMax(const DataType type) { +const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) { return LookupOrCreate(type, &max_func_, [this, type] { const string type_string = DataTypeString(type); VLOG(1) << "Building Max() for " << type_string; - xla::ComputationBuilder b(builder()->client(), "max<" + type_string + ">"); + xla::XlaBuilder b("max<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); @@ -137,11 +138,11 @@ const xla::Computation* XlaContext::GetOrCreateMax(const DataType type) { }); } -const xla::Computation* XlaContext::GetOrCreateMin(const DataType type) { +const xla::XlaComputation* XlaContext::GetOrCreateMin(const DataType type) { return LookupOrCreate(type, &min_func_, [this, type] { const string type_string = DataTypeString(type); VLOG(1) << "Building Min() for " << type_string; - xla::ComputationBuilder b(builder()->client(), "min<" + type_string + ">"); + xla::XlaBuilder b("min<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); @@ -151,11 +152,11 @@ const xla::Computation* XlaContext::GetOrCreateMin(const DataType type) { }); } -const xla::Computation* XlaContext::GetOrCreateAdd(const DataType type) { +const xla::XlaComputation* XlaContext::GetOrCreateAdd(const DataType type) { return LookupOrCreate(type, &add_func_, [this, type] { const string type_string = DataTypeString(type); VLOG(1) << "Building Add() for " << type_string; - xla::ComputationBuilder b(builder()->client(), "add<" + type_string + ">"); + xla::XlaBuilder b("add<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); @@ -165,11 +166,11 @@ const xla::Computation* XlaContext::GetOrCreateAdd(const DataType type) { }); } -const xla::Computation* XlaContext::GetOrCreateMul(const DataType type) { +const xla::XlaComputation* XlaContext::GetOrCreateMul(const DataType type) { return LookupOrCreate(type, &mul_func_, [this, type] { const string type_string = DataTypeString(type); VLOG(1) << "Building Mul() for " << type_string; - xla::ComputationBuilder b(builder()->client(), "mul<" + type_string + ">"); + xla::XlaBuilder b("mul<" + type_string + ">"); xla::PrimitiveType xla_type; TF_CHECK_OK(DataTypeToPrimitiveType(type, &xla_type)); auto x = b.Parameter(0, xla::ShapeUtil::MakeShape(xla_type, {}), "x"); @@ -179,9 +180,9 @@ const xla::Computation* XlaContext::GetOrCreateMul(const DataType type) { }); } -const xla::Computation* XlaContext::LookupOrCreate( +const xla::XlaComputation* XlaContext::LookupOrCreate( DataType type, ComputationMap* out, - const std::function& create) { + const std::function& create) { { const auto& entry = (*out)[type]; if (!entry.IsNull()) { diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h index 00fbaba37c5429..341bf6ff1f37fa 100644 --- a/tensorflow/compiler/tf2xla/xla_context.h +++ b/tensorflow/compiler/tf2xla/xla_context.h @@ -22,8 +22,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_compilation_device.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" @@ -42,32 +42,44 @@ class XlaContext : public ResourceBase { static XlaContext& Get(const OpKernelContext* ctx); static XlaContext& Get(const XlaOpKernelContext* ctx); - // Creates a new XlaContext. - XlaContext(XlaCompiler* compiler, xla::ComputationBuilder* builder, + // Creates a new XlaContext. See the documentation on the class data fields + // for descriptions of the arguments. + XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder, bool allow_cpu_custom_calls, bool resolve_compile_time_constants, + bool is_entry_computation, const std::function* - variable_representation_shape_fn); + shape_representation_fn); // Virtual method defined by ResourceBase. string DebugString() override; XlaCompiler* compiler() const { return compiler_; } - // Returns the ComputationBuilder that Ops use for compiling new - // expressions. - xla::ComputationBuilder* builder(); + // Returns the XlaBuilder that Ops use for compiling new expressions. + xla::XlaBuilder* builder(); bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; } + bool resolve_compile_time_constants() const { + return resolve_compile_time_constants_; + } + bool is_entry_computation() const { return is_entry_computation_; } + const std::vector& args() const { return args_; } void set_args(std::vector args); - const std::vector& retvals() { return retvals_; } + struct Retval { + DataType type; + TensorShape shape; + // An XlaExpression representing the Retval's value. + XlaExpression expression; + }; + const std::vector& retvals() { return retvals_; } // This is called by the Retval Op to associate a computed value // with a specific return value of the subgraph. - void AddRetval(int retval_index, DataType type, - const xla::ComputationDataHandle& handle); + void AddRetval(int retval_index, DataType type, const TensorShape& shape, + const xla::XlaOp& handle); // As for Retval, but for return values that are compile-time constants. Status AddConstRetval(int retval_index, DataType dtype, @@ -79,8 +91,7 @@ class XlaContext : public ResourceBase { // Fails if the resource already exists. Status CreateResource(XlaResource::Kind kind, int arg_num, string name, DataType type, TensorShape shape, - const xla::ComputationDataHandle& handle, - int64 tensor_array_size, + const xla::XlaOp& handle, int64 tensor_array_size, const std::set& tensor_array_gradients, XlaResource** resource); @@ -89,29 +100,29 @@ class XlaContext : public ResourceBase { } // Returns the XLA shape to be used to represent a variable of TF `shape` - // and `type`. - TensorShape VariableRepresentationShape(const TensorShape& shape, - DataType type) const; + // and `type`, or of an argument or return value of a top-level computation. + TensorShape RepresentationShape(const TensorShape& shape, + DataType type) const; // Get an XLA lambda to compute Max. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. 
- const xla::Computation* GetOrCreateMax(const DataType type); + const xla::XlaComputation* GetOrCreateMax(const DataType type); // Get an XLA lambda to compute Min. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::Computation* GetOrCreateMin(const DataType type); + const xla::XlaComputation* GetOrCreateMin(const DataType type); // Get an XLA lambda to compute Add. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::Computation* GetOrCreateAdd(const DataType type); + const xla::XlaComputation* GetOrCreateAdd(const DataType type); // Get an XLA lambda to compute Mul. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::Computation* GetOrCreateMul(const DataType type); + const xla::XlaComputation* GetOrCreateMul(const DataType type); // The name of the XlaContext resource during symbolic graph execution. static const char kXlaContextResourceName[]; @@ -119,9 +130,8 @@ class XlaContext : public ResourceBase { private: XlaCompiler* const compiler_; - // The ComputationBuilder used to construct the subgraph's compiled - // representation. - xla::ComputationBuilder* builder_; + // The XlaBuilder used to construct the subgraph's compiled representation. + xla::XlaBuilder* builder_; // Allow ops to emit CustomCall operations for CPU. const bool allow_cpu_custom_calls_; @@ -135,25 +145,33 @@ class XlaContext : public ResourceBase { std::vector args_; // Return values of the Tensorflow graph, indexed by _Retval index. - std::vector retvals_; + std::vector retvals_; // Holds ownership of resources. The resources are not ordered. std::vector> resources_; - // A function that describes how variable shapes should be represented - // in XLA. Variable values will be reshaped to this shape. Must be non-null. + // Is this a top-level computation, or an inner computation (e.g., a while + // body)? + const bool is_entry_computation_; + + // A function that describes how the shapes of + // a) argument and return value, for entry computations + // b) variables, for all computations, + // should be represented in XLA. Parameters/return values will be shaped + // according to this function, and reshaped back to/from their declared shapes + // for computations. Must be non-null. const std::function* - variable_representation_shape_fn_; + shape_representation_fn_; // Cache of prebuilt computations indexed by their type. - using ComputationMap = std::map; + using ComputationMap = std::map; // Finds the value for the given type in out map if it already // exists or makes a new value with create function and keeps it the // map. The returned value != nullptr and is owned by the map. - const xla::Computation* LookupOrCreate( + const xla::XlaComputation* LookupOrCreate( DataType type, ComputationMap* out, - const std::function& create); + const std::function& create); // Cached computation to compute Max of two elements, specialized by type. ComputationMap max_func_; diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index 62a5114837e07f..f1594193af09c7 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -22,7 +22,7 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_context.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/core/status.h" @@ -32,13 +32,12 @@ namespace tensorflow { namespace { -Status ArgMinMax(xla::ComputationBuilder* builder, XlaOpKernelContext* ctx, - const xla::ComputationDataHandle& input, - const TensorShape& input_shape, DataType input_type, - DataType output_type, int axis, bool is_min, - xla::ComputationDataHandle* argminmax) { - xla::ComputationDataHandle init_value; - const xla::Computation* reducer; +Status ArgMinMax(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, + const xla::XlaOp& input, const TensorShape& input_shape, + DataType input_type, DataType output_type, int axis, + bool is_min, xla::XlaOp* argminmax) { + xla::XlaOp init_value; + const xla::XlaComputation* reducer; if (is_min) { init_value = XlaHelpers::MaxValue(builder, input_type); reducer = ctx->GetOrCreateMin(input_type); @@ -50,13 +49,13 @@ Status ArgMinMax(xla::ComputationBuilder* builder, XlaOpKernelContext* ctx, xla::PrimitiveType xla_output_type; TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(output_type, &xla_output_type)); - xla::ComputationDataHandle input_max = builder->Reduce( - input, init_value, *reducer, /*dimensions_to_reduce=*/{axis}); + xla::XlaOp input_max = builder->Reduce(input, init_value, *reducer, + /*dimensions_to_reduce=*/{axis}); std::vector broadcast_dims(input_shape.dims() - 1); std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0); std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1); // Compute a mask that has 1s for elements equal to the maximum. - xla::ComputationDataHandle partial_mask = builder->ConvertElementType( + xla::XlaOp partial_mask = builder->ConvertElementType( builder->Eq(input, input_max, broadcast_dims), xla_output_type); // In order to make identity elements for a bitwise And, we: @@ -65,23 +64,23 @@ Status ArgMinMax(xla::ComputationBuilder* builder, XlaOpKernelContext* ctx, // 0xFF...F int32 bits_in_type = xla::ShapeUtil::ByteSizeOfPrimitiveType(xla_output_type) * 8 - 1; - xla::ComputationDataHandle shift_amount = + xla::XlaOp shift_amount = XlaHelpers::IntegerLiteral(builder, output_type, bits_in_type); - xla::ComputationDataHandle full_mask = builder->ShiftRightArithmetic( + xla::XlaOp full_mask = builder->ShiftRightArithmetic( builder->ShiftLeft(partial_mask, shift_amount), shift_amount); // And with the vector [0, 1, 2, ...] to convert each 0xFF...F into its // index. - xla::ComputationDataHandle iota; + xla::XlaOp iota; const int64 axis_size = input_shape.dim_size(axis); TF_RETURN_IF_ERROR(XlaHelpers::Iota(builder, output_type, axis_size, &iota)); - xla::ComputationDataHandle product = + xla::XlaOp product = builder->And(full_mask, iota, /*broadcast_dimensions=*/{axis}); // If there are multiple maximum elements, choose the one with the highest // index. 
- xla::ComputationDataHandle output = + xla::XlaOp output = builder->Reduce(product, XlaHelpers::MinValue(builder, output_type), *ctx->GetOrCreateMax(output_type), /*dimensions_to_reduce=*/{axis}); @@ -91,36 +90,31 @@ Status ArgMinMax(xla::ComputationBuilder* builder, XlaOpKernelContext* ctx, } // namespace -xla::ComputationDataHandle XlaHelpers::MinValue(xla::ComputationBuilder* b, - DataType data_type) { +xla::XlaOp XlaHelpers::MinValue(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return b->ConstantLiteral(xla::Literal::MinValue(type)); } -xla::ComputationDataHandle XlaHelpers::MaxValue(xla::ComputationBuilder* b, - DataType data_type) { +xla::XlaOp XlaHelpers::MaxValue(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return b->ConstantLiteral(xla::Literal::MaxValue(type)); } -xla::ComputationDataHandle XlaHelpers::Zero(xla::ComputationBuilder* b, - DataType data_type) { +xla::XlaOp XlaHelpers::Zero(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return b->ConstantLiteral(xla::Literal::Zero(type)); } -xla::ComputationDataHandle XlaHelpers::One(xla::ComputationBuilder* b, - DataType data_type) { +xla::XlaOp XlaHelpers::One(xla::XlaBuilder* b, DataType data_type) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return b->ConstantLiteral(xla::Literal::One(type)); } -xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b, - DataType data_type) { +xla::XlaOp XlaHelpers::Epsilon(xla::XlaBuilder* b, DataType data_type) { switch (data_type) { case DT_HALF: return b->ConstantR0( @@ -137,16 +131,15 @@ xla::ComputationDataHandle XlaHelpers::Epsilon(xla::ComputationBuilder* b, } } -xla::ComputationDataHandle XlaHelpers::IntegerLiteral( - xla::ComputationBuilder* b, DataType data_type, int64 value) { +xla::XlaOp XlaHelpers::IntegerLiteral(xla::XlaBuilder* b, DataType data_type, + int64 value) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return ::tensorflow::IntegerLiteral(b, type, value); } -xla::ComputationDataHandle XlaHelpers::FloatLiteral(xla::ComputationBuilder* b, - DataType data_type, - double value) { +xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type, + double value) { xla::PrimitiveType type; TF_CHECK_OK(DataTypeToPrimitiveType(data_type, &type)); return ::tensorflow::FloatLiteral(b, type, value); @@ -183,28 +176,24 @@ static Tensor MakeLinspaceTensor(const TensorShape& shape, int64 depth) { return linspace; } -Status XlaHelpers::ArgMax(xla::ComputationBuilder* builder, - XlaOpKernelContext* ctx, - const xla::ComputationDataHandle& input, +Status XlaHelpers::ArgMax(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, + const xla::XlaOp& input, const TensorShape& input_shape, DataType input_type, - DataType output_type, int axis, - xla::ComputationDataHandle* argmax) { + DataType output_type, int axis, xla::XlaOp* argmax) { return ArgMinMax(builder, ctx, input, input_shape, input_type, output_type, axis, /*is_min=*/false, argmax); } -Status XlaHelpers::ArgMin(xla::ComputationBuilder* builder, - XlaOpKernelContext* ctx, - const xla::ComputationDataHandle& input, +Status XlaHelpers::ArgMin(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, + const xla::XlaOp& input, const TensorShape& input_shape, DataType input_type, - DataType output_type, int axis, - 
xla::ComputationDataHandle* argmin) { + DataType output_type, int axis, xla::XlaOp* argmin) { return ArgMinMax(builder, ctx, input, input_shape, input_type, output_type, axis, /*is_min=*/true, argmin); } -Status XlaHelpers::Iota(xla::ComputationBuilder* builder, DataType dtype, - int64 size, xla::ComputationDataHandle* iota) { +Status XlaHelpers::Iota(xla::XlaBuilder* builder, DataType dtype, int64 size, + xla::XlaOp* iota) { TensorShape linspace_shape({size}); Tensor linspace; switch (dtype) { @@ -227,13 +216,10 @@ Status XlaHelpers::Iota(xla::ComputationBuilder* builder, DataType dtype, return Status::OK(); } -Status XlaHelpers::OneHot(xla::ComputationBuilder* builder, int64 depth, - int axis, DataType index_type, - const TensorShape& indices_shape, - const xla::ComputationDataHandle& indices, - const xla::ComputationDataHandle& on_value, - const xla::ComputationDataHandle& off_value, - xla::ComputationDataHandle* one_hot) { +Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis, + DataType index_type, const TensorShape& indices_shape, + const xla::XlaOp& indices, const xla::XlaOp& on_value, + const xla::XlaOp& off_value, xla::XlaOp* one_hot) { const int indices_dims = indices_shape.dims(); const int output_dims = indices_dims + 1; @@ -267,7 +253,7 @@ Status XlaHelpers::OneHot(xla::ComputationBuilder* builder, int64 depth, std::vector broadcast_dims(indices_shape.dims()); std::iota(broadcast_dims.begin(), broadcast_dims.begin() + axis, 0); std::iota(broadcast_dims.begin() + axis, broadcast_dims.end(), axis + 1); - xla::ComputationDataHandle one_hot_bool = builder->Eq( + xla::XlaOp one_hot_bool = builder->Eq( indices, builder->ConstantLiteral(linspace_literal), broadcast_dims); // Selects the user-provided off_value and on_value values. @@ -278,16 +264,15 @@ Status XlaHelpers::OneHot(xla::ComputationBuilder* builder, int64 depth, } DataType XlaHelpers::SumAccumulationType(const DataType& dtype) { - if (dtype == DT_BFLOAT16) { + if (dtype == DT_BFLOAT16 || dtype == DT_HALF) { return DT_FLOAT; } return dtype; } -xla::ComputationDataHandle XlaHelpers::ConvertElementType( - xla::ComputationBuilder* const builder, - const xla::ComputationDataHandle& operand, - const DataType new_element_type) { +xla::XlaOp XlaHelpers::ConvertElementType(xla::XlaBuilder* const builder, + const xla::XlaOp& operand, + const DataType new_element_type) { xla::PrimitiveType convert_to; TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to)); return builder->ConvertElementType(operand, convert_to); diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h index 68ab93b64a5fa8..c3fdc5252e7436 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.h +++ b/tensorflow/compiler/tf2xla/xla_helpers.h @@ -19,7 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ #include "tensorflow/compiler/tf2xla/xla_context.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -30,41 +30,34 @@ class XlaHelpers { public: // Returns a handle representing the minimum value of a scalar // element of data_type. - static xla::ComputationDataHandle MinValue(xla::ComputationBuilder* b, - DataType data_type); + static xla::XlaOp MinValue(xla::XlaBuilder* b, DataType data_type); // Returns a handle representing the maximum value of a scalar // element of data_type. 
- static xla::ComputationDataHandle MaxValue(xla::ComputationBuilder* b, - DataType data_type); + static xla::XlaOp MaxValue(xla::XlaBuilder* b, DataType data_type); // Returns a handle representing the zero value of a scalar // element of data_type. - static xla::ComputationDataHandle Zero(xla::ComputationBuilder* b, - DataType data_type); + static xla::XlaOp Zero(xla::XlaBuilder* b, DataType data_type); // Returns a handle representing the one value of a scalar // element of data_type. - static xla::ComputationDataHandle One(xla::ComputationBuilder* b, - DataType data_type); + static xla::XlaOp One(xla::XlaBuilder* b, DataType data_type); // Returns the machine epsilon for floating-point type `data_type`, i.e., // the difference between 1.0 and the next representable value. - static xla::ComputationDataHandle Epsilon(xla::ComputationBuilder* b, - DataType data_type); + static xla::XlaOp Epsilon(xla::XlaBuilder* b, DataType data_type); // Returns a handle representing the given value of an integer scalar // element of data_type. // Note that unlike One and Zero, does not work on boolean types. - static xla::ComputationDataHandle IntegerLiteral(xla::ComputationBuilder* b, - DataType data_type, - int64 value); + static xla::XlaOp IntegerLiteral(xla::XlaBuilder* b, DataType data_type, + int64 value); // Returns a handle representing the given value of a floating-point scalar // element of data_type. - static xla::ComputationDataHandle FloatLiteral(xla::ComputationBuilder* b, - DataType data_type, - double value); + static xla::XlaOp FloatLiteral(xla::XlaBuilder* b, DataType data_type, + double value); // Reshapes literal 'input' to have 'shape'. Both the original shape and // 'shape' must contain the same number of elements. @@ -75,38 +68,32 @@ class XlaHelpers { // Sets `argmax` to the argmax of `input` along `axis`. `input_shape` and // `input_dtype` are the shape and dtype of `input` respectively, and // `output_type` is the dtype to use for `argmax`. - static Status ArgMax(xla::ComputationBuilder* builder, - XlaOpKernelContext* ctx, - const xla::ComputationDataHandle& input, - const TensorShape& input_shape, DataType input_type, - DataType output_type, int axis, - xla::ComputationDataHandle* argmax); + static Status ArgMax(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, + const xla::XlaOp& input, const TensorShape& input_shape, + DataType input_type, DataType output_type, int axis, + xla::XlaOp* argmax); // Sets `argmin` to the argmin of `input` along `axis`. `input_shape` and // `input_dtype` are the shape and dtype of `input` respectively, and // `output_type` is the dtype to use for `argmin`. - static Status ArgMin(xla::ComputationBuilder* builder, - XlaOpKernelContext* ctx, - const xla::ComputationDataHandle& input, - const TensorShape& input_shape, DataType input_type, - DataType output_type, int axis, - xla::ComputationDataHandle* argmin); + static Status ArgMin(xla::XlaBuilder* builder, XlaOpKernelContext* ctx, + const xla::XlaOp& input, const TensorShape& input_shape, + DataType input_type, DataType output_type, int axis, + xla::XlaOp* argmin); // Sets *iota to a rank 1 tensor with values [0, 1, 2, ...] of `dtype`. - static Status Iota(xla::ComputationBuilder* builder, DataType dtype, - int64 size, xla::ComputationDataHandle* iota); + static Status Iota(xla::XlaBuilder* builder, DataType dtype, int64 size, + xla::XlaOp* iota); // Converts `indices` into a one-hot representation. `depth` is the size // of the new axis to add. 
`axis` is the position at which to add the new // axis. `indices_shape` is the shape of `indices`. `on_value` and // `off_value` represent the values to use for the on and off positions, // respectively. - static Status OneHot(xla::ComputationBuilder* builder, int64 depth, int axis, + static Status OneHot(xla::XlaBuilder* builder, int64 depth, int axis, DataType index_type, const TensorShape& indices_shape, - const xla::ComputationDataHandle& indices, - const xla::ComputationDataHandle& on_value, - const xla::ComputationDataHandle& off_value, - xla::ComputationDataHandle* one_hot); + const xla::XlaOp& indices, const xla::XlaOp& on_value, + const xla::XlaOp& off_value, xla::XlaOp* one_hot); // Certain DataTypes should use increased precision DataTypes when performing // reductions. This function remaps a given DataType to a higher precision @@ -115,10 +102,9 @@ class XlaHelpers { // A helper for creating a ConvertElementType xla op given a DataType rather // than the xla::PrimitiveType. - static xla::ComputationDataHandle ConvertElementType( - xla::ComputationBuilder* const builder, - const xla::ComputationDataHandle& operand, - const DataType new_element_type); + static xla::XlaOp ConvertElementType(xla::XlaBuilder* const builder, + const xla::XlaOp& operand, + const DataType new_element_type); }; } // end namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index 1fe6e69ff2dc83..9e17756b27733e 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -112,10 +112,10 @@ void CollectNames(const T& entries, std::vector* nonempty_names, XlaJitCompiledCpuFunction::Compile( const GraphDef& graph_def, const tf2xla::Config& config, const xla::ExecutableBuildOptions& build_options) { - // Convert the graph_def into an xla::Computation. + // Convert the graph_def into an xla::XlaComputation. 
TF_ASSIGN_OR_RETURN(xla::LocalClient * client, xla::ClientLibrary::GetOrCreateLocalClient()); - xla::Computation computation; + xla::XlaComputation computation; TF_RETURN_IF_ERROR(tensorflow::ConvertGraphDefToXla(graph_def, config, client, &computation)); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index c4bb90d58755f1..76c68d81af4dd9 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -30,7 +30,7 @@ bool XlaOpKernelContext::ValidateInputsAreSameShape(OpKernel* op) { return context_->ValidateInputsAreSameShape(op); } -xla::ComputationBuilder* XlaOpKernelContext::builder() const { +xla::XlaBuilder* XlaOpKernelContext::builder() const { return XlaContext::Get(this).builder(); } @@ -38,9 +38,9 @@ xla::ComputationBuilder* XlaOpKernelContext::builder() const { static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) { const XlaExpression* expression = reinterpret_cast(tensor.tensor_data().data()); - CHECK(expression->handle().handle() != 0 || + CHECK(expression->handle().builder() != nullptr || expression->resource() != nullptr); - VLOG(1) << "Fetched T" << expression->handle().handle(); + VLOG(1) << "Fetched T" << expression->handle(); return expression; } @@ -48,20 +48,18 @@ static const XlaExpression* CastExpressionFromTensor(const Tensor& tensor) { static XlaExpression* CastExpressionFromUninitializedTensor(Tensor* tensor) { const XlaExpression* expression = reinterpret_cast(tensor->tensor_data().data()); - CHECK_EQ(expression->handle().handle(), 0); + CHECK_EQ(expression->handle().builder(), nullptr); return const_cast(expression); } -// Retrieves the ComputationDataHandle from an input Tensor to an Op. This -// computation was constructed by an Op that executed previously and -// created the output Tensor using CreateOutputTensorFromComputation -// or CreateConstantOutputTensor. -static const xla::ComputationDataHandle& GetComputationFromTensor( - const Tensor& tensor) { +// Retrieves the XlaOp from an input Tensor to an Op. This computation was +// constructed by an Op that executed previously and created the output Tensor +// using CreateOutputTensorFromComputation or CreateConstantOutputTensor. +static const xla::XlaOp& GetComputationFromTensor(const Tensor& tensor) { return CastExpressionFromTensor(tensor)->handle(); } -const xla::ComputationDataHandle& XlaOpKernelContext::Input(int index) { +const xla::XlaOp& XlaOpKernelContext::Input(int index) { return GetComputationFromTensor(context_->input(index)); } @@ -106,7 +104,7 @@ Status XlaOpKernelContext::ConstantInputReshaped( return HostTensorToLiteral(temp, constant_literal); } - xla::ComputationDataHandle handle = expression->handle(); + xla::XlaOp handle = expression->handle(); if (new_shape != tensor.shape()) { // Reshape the handle to the desired shape. handle = builder()->Reshape(handle, new_shape.dim_sizes()); @@ -141,8 +139,17 @@ Status XlaOpKernelContext::ConstantInputReshaped( } // Ask the XLA compiler to evaluate the data handle to a literal. 
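  // With XlaBuilder this is a two-step process: first build a constant
  // subgraph rooted at `handle`, then ask the client to evaluate that
  // subgraph to a literal with the requested layout.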
+ xla::StatusOr constant_graph = + builder()->BuildConstantSubGraph(handle); + if (!constant_graph.ok()) { + return errors::Internal( + "Error getting a compile-time constant graph for ", + context_->op_kernel().name(), " input ", index, + ".\nError: ", constant_graph.status().error_message()); + } xla::StatusOr> computed = - builder()->ComputeConstant(handle, &layout); + compiler()->client()->ComputeConstant(constant_graph.ValueOrDie(), + &layout); if (!computed.ok()) { return errors::Internal("Error evaluating ", context_->op_kernel().name(), " input ", index, @@ -260,9 +267,9 @@ Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape) { return Status::OK(); } -Status XlaOpKernelContext::InputList( - StringPiece name, std::vector* handles, - std::vector* shapes) { +Status XlaOpKernelContext::InputList(StringPiece name, + std::vector* handles, + std::vector* shapes) { OpInputList inputs; TF_RETURN_IF_ERROR(context_->input_list(name, &inputs)); handles->clear(); @@ -285,9 +292,9 @@ Status XlaOpKernelContext::ConstantInputList( return Status::OK(); } -Status XlaOpKernelContext::ReadVariableInput( - int index, DataType type, TensorShape* shape, - xla::ComputationDataHandle* value) { +Status XlaOpKernelContext::ReadVariableInput(int index, DataType type, + TensorShape* shape, + xla::XlaOp* value) { const Tensor& tensor = context_->input(index); const XlaExpression* expression = CastExpressionFromTensor(tensor); XlaResource* variable = expression->resource(); @@ -307,8 +314,8 @@ Status XlaOpKernelContext::ReadVariableInput( } XlaContext& xla_context = XlaContext::Get(context_); - TensorShape representation_shape = xla_context.VariableRepresentationShape( - variable->shape(), variable->type()); + TensorShape representation_shape = + xla_context.RepresentationShape(variable->shape(), variable->type()); if (representation_shape == variable->shape()) { *value = variable->value(); } else { @@ -334,8 +341,7 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, return Status::OK(); } -void XlaOpKernelContext::SetOutput(int index, - const xla::ComputationDataHandle& handle) { +void XlaOpKernelContext::SetOutput(int index, const xla::XlaOp& handle) { // Makes the host Tensor that will refer to the expression. Tensor* output = nullptr; auto shape = builder()->GetShape(handle); @@ -349,7 +355,7 @@ void XlaOpKernelContext::SetOutput(int index, // corresponds. TensorShape tensor_shape; OP_REQUIRES_OK(context_, - XLAShapeToTensorShape(*shape.ValueOrDie(), &tensor_shape)); + XLAShapeToTensorShape(shape.ValueOrDie(), &tensor_shape)); OP_REQUIRES_OK(context_, context_->allocate_output(index, tensor_shape, &output)); @@ -364,8 +370,8 @@ void XlaOpKernelContext::SetConstantOutput(int index, const Tensor& constant) { xla::Literal literal; OP_REQUIRES_OK(context_, HostTensorToLiteral(constant, &literal)); - xla::ComputationDataHandle handle = builder()->ConstantLiteral(literal); - CHECK_NE(handle.handle(), 0); + xla::XlaOp handle = builder()->ConstantLiteral(literal); + CHECK_NE(handle.builder(), nullptr); // Make the Tensor that will refer to the expression. 
Tensor* output = nullptr; @@ -386,8 +392,7 @@ void XlaOpKernelContext::SetInvalidOutput(int index) { OP_REQUIRES_OK(context_, context_->allocate_output(index, TensorShape({}), &output)); XlaExpression* expression = CastExpressionFromUninitializedTensor(output); - xla::ComputationDataHandle handle; - handle.set_handle(0); + xla::XlaOp handle; expression->set_handle(handle); } @@ -410,8 +415,8 @@ Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) { } Status XlaOpKernelContext::AssignVariable(int input_index, DataType type, - xla::ComputationDataHandle handle) { - TF_RET_CHECK(handle.handle() != 0); + xla::XlaOp handle) { + TF_RET_CHECK(handle.builder() != nullptr); const XlaExpression* expression = CastExpressionFromTensor(context_->input(input_index)); @@ -425,13 +430,13 @@ Status XlaOpKernelContext::AssignVariable(int input_index, DataType type, } TensorShape shape; TF_RETURN_IF_ERROR( - XLAShapeToTensorShape(*shape_or_status.ValueOrDie(), &shape)); + XLAShapeToTensorShape(shape_or_status.ValueOrDie(), &shape)); TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape)); XlaContext& xla_context = XlaContext::Get(context_); TensorShape representation_shape = - xla_context.VariableRepresentationShape(shape, type); + xla_context.RepresentationShape(shape, type); if (shape != representation_shape) { handle = builder()->Reshape(handle, representation_shape.dim_sizes()); } @@ -457,22 +462,22 @@ void XlaOpKernelContext::CtxFailureWithWarning(const char* file, int line, context_->CtxFailureWithWarning(file, line, s); } -const xla::Computation* XlaOpKernelContext::GetOrCreateMax( +const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMax( const DataType type) { return XlaContext::Get(context_).GetOrCreateMax(type); } -const xla::Computation* XlaOpKernelContext::GetOrCreateMin( +const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMin( const DataType type) { return XlaContext::Get(context_).GetOrCreateMin(type); } -const xla::Computation* XlaOpKernelContext::GetOrCreateAdd( +const xla::XlaComputation* XlaOpKernelContext::GetOrCreateAdd( const DataType type) { return XlaContext::Get(context_).GetOrCreateAdd(type); } -const xla::Computation* XlaOpKernelContext::GetOrCreateMul( +const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMul( const DataType type) { return XlaContext::Get(context_).GetOrCreateMul(type); } diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h index 4e4b97e0cec8d1..667dc262ca03ca 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.h +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h @@ -17,7 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_XLA_OP_KERNEL_H_ #include "tensorflow/compiler/tf2xla/xla_compiler.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/platform/macros.h" @@ -58,8 +58,8 @@ class XlaOpKernelContext { public: explicit XlaOpKernelContext(OpKernelContext* context); - // Returns the XLA ComputationBuilder containing the output of compilation. - xla::ComputationBuilder* builder() const; + // Returns the XLA XlaBuilder containing the output of compilation. + xla::XlaBuilder* builder() const; // Inputs @@ -72,10 +72,10 @@ class XlaOpKernelContext { // Returns the shape of input 'index'. TensorShape InputShape(int index); - // Returns input 'index' as a ComputationDataHandle. 
Unlike + // Returns input 'index' as a XlaOp. Unlike // OpKernelContext::Input returns a symbolic value rather than a concrete // Tensor. - const xla::ComputationDataHandle& Input(int index); + const xla::XlaOp& Input(int index); // Returns true if all inputs are the same shape, otherwise sets the // status to a non-OK value and returns false. @@ -85,8 +85,7 @@ class XlaOpKernelContext { // Returns the named list-valued immutable input in "list", as // defined in the OpDef. If the named output is not list-valued, // returns a one-element list. - Status InputList(StringPiece name, - std::vector* handles, + Status InputList(StringPiece name, std::vector* handles, std::vector* shapes); // Helper methods for constant inputs. @@ -132,10 +131,10 @@ class XlaOpKernelContext { return context_->expected_output_dtype(index); } - // Sets output 'index' to the ComputationDataHandle 'handle'. + // Sets output 'index' to the XlaOp 'handle'. // All outputs should be set using SetOutput and SetConstantOutput, not // via the underlying OpKernelContext. - void SetOutput(int index, const xla::ComputationDataHandle& handle); + void SetOutput(int index, const xla::XlaOp& handle); // Sets output 'index' to compile-time constant 'host_tensor', where // 'host_tensor' is a tensor in host memory. It is preferable to use @@ -168,14 +167,13 @@ class XlaOpKernelContext { // variable. Returns an error if the variable has not been initialized, or if // its type does not match `type`. Status ReadVariableInput(int index, DataType type, TensorShape* shape, - xla::ComputationDataHandle* value); + xla::XlaOp* value); // Assigns the value `handle` to the variable referenced by input // `input_index`. The variable must be of `type`. Returns an error if the // variable has been initialized with a different type or with a // different shape. - Status AssignVariable(int input_index, DataType type, - xla::ComputationDataHandle handle); + Status AssignVariable(int input_index, DataType type, xla::XlaOp handle); // Helper routines for the OP_REQUIRES macros void CtxFailure(const Status& s); @@ -205,22 +203,22 @@ class XlaOpKernelContext { // Gets an XLA lambda to compute Max. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::Computation* GetOrCreateMax(const DataType type); + const xla::XlaComputation* GetOrCreateMax(const DataType type); // Gets an XLA lambda to compute Min. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::Computation* GetOrCreateMin(const DataType type); + const xla::XlaComputation* GetOrCreateMin(const DataType type); // Gets an XLA lambda to compute Add. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. - const xla::Computation* GetOrCreateAdd(const DataType type); + const xla::XlaComputation* GetOrCreateAdd(const DataType type); // Gets an XLA lambda to compute Mul. This is cached in the // XlaContext since it may be used by multiple Ops. There is a // separate specialization of the computation for each DataType. 
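To make the header changes above concrete, here is a small sketch of an op kernel written against the updated interface, where inputs and outputs are `xla::XlaOp` values and `builder()` returns an `xla::XlaBuilder*`. The op name and its registration are hypothetical (they assume a matching TensorFlow op is defined elsewhere), and `builder()->Add` is a standard XlaBuilder method rather than something introduced by this patch.

```c++
#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
#include "tensorflow/compiler/tf2xla/xla_op_registry.h"

namespace tensorflow {

// Illustrative kernel: doubles its single input. "Double" is a made-up op.
class DoubleOp : public XlaOpKernel {
 public:
  explicit DoubleOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}

  void Compile(XlaOpKernelContext* ctx) override {
    const xla::XlaOp& x = ctx->Input(0);       // symbolic value, not a Tensor
    xla::XlaOp y = ctx->builder()->Add(x, x);  // builder() is an XlaBuilder*
    ctx->SetOutput(0, y);
  }
};

REGISTER_XLA_OP(Name("Double"), DoubleOp);

}  // namespace tensorflow
```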
- const xla::Computation* GetOrCreateMul(const DataType type); + const xla::XlaComputation* GetOrCreateMul(const DataType type); private: OpKernelContext* const context_; diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index bbe808595d9583..4692038b61f687 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -39,10 +39,10 @@ const char* const DEVICE_XLA_GPU = "XLA_GPU"; static Status LaunchOpHasKernelForDevice(const DeviceType& device_type) { const OpDef* op_def; - TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef("_XlaLaunch", &op_def)); + TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef("XlaLaunch", &op_def)); NodeDef node_def; node_def.set_name("_XlaLaunch-op"); - node_def.set_op("_XlaLaunch"); + node_def.set_op("XlaLaunch"); string kernel_class_name; TF_RETURN_IF_ERROR(FindKernelDef(device_type, node_def, /*KernelDef*/ nullptr, &kernel_class_name)); @@ -311,7 +311,7 @@ XlaOpRegistry& XlaOpRegistry::Instance() { XlaOpRegistrationBuilder::XlaOpRegistrationBuilder(StringPiece name) { registration_.reset(new XlaOpRegistry::OpRegistration); - registration_->name = name.ToString(); + registration_->name = std::string(name); } XlaOpRegistrationBuilder XlaOpRegistrationBuilder::Name(StringPiece name) { @@ -323,14 +323,14 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Device( gtl::ArraySlice devices) { registration_->has_device_whitelist = true; for (StringPiece device : devices) { - registration_->device_whitelist.insert(device.ToString()); + registration_->device_whitelist.insert(std::string(device)); } return *this; } XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::Device(StringPiece device) { registration_->has_device_whitelist = true; - registration_->device_whitelist.insert(device.ToString()); + registration_->device_whitelist.insert(std::string(device)); return *this; } @@ -347,7 +347,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::AllowResourceTypes() { XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( StringPiece attr_name, DataType allowed) { std::set& types = - registration_->type_constraints[attr_name.ToString()]; + registration_->type_constraints[std::string(attr_name)]; types.insert(allowed); return *this; } @@ -355,7 +355,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( StringPiece attr_name, gtl::ArraySlice allowed) { std::set& types = - registration_->type_constraints[attr_name.ToString()]; + registration_->type_constraints[std::string(attr_name)]; for (DataType t : allowed) { types.insert(t); } @@ -364,7 +364,7 @@ XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::TypeConstraint( XlaOpRegistrationBuilder& XlaOpRegistrationBuilder::CompileTimeConstInput( StringPiece input_name) { - registration_->compile_time_constant_inputs.insert(input_name.ToString()); + registration_->compile_time_constant_inputs.insert(std::string(input_name)); return *this; } @@ -394,7 +394,7 @@ XlaBackendRegistrar::XlaBackendRegistrar( StringPiece name, gtl::ArraySlice types, XlaOpRegistry::BackendOpFilter op_filter) { XlaOpRegistry& registry = XlaOpRegistry::Instance(); - registry.RegisterBackend(name.ToString(), types, op_filter); + registry.RegisterBackend(std::string(name), types, op_filter); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index 
c2075b44b82ba2..540c65c597f20d 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -26,8 +26,7 @@ limitations under the License. namespace tensorflow { XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, - TensorShape shape, - const xla::ComputationDataHandle& initial_value, + TensorShape shape, const xla::XlaOp& initial_value, int64 tensor_array_size, const std::set& tensor_array_gradients) : kind_(kind), @@ -41,11 +40,10 @@ XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type, CHECK(kind_ != kInvalid); for (const string& gradient : tensor_array_gradients) { - tensor_array_gradients_[gradient].reset( - new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, - /*name=*/strings::StrCat("TensorArrayGrad: ", name_), - type_, shape_, xla::ComputationDataHandle(), - tensor_array_size_, /*tensor_array_gradients=*/{})); + tensor_array_gradients_[gradient].reset(new XlaResource( + /*kind=*/kTensorArray, /*arg_num=*/-1, + /*name=*/strings::StrCat("TensorArrayGrad: ", name_), type_, shape_, + xla::XlaOp(), tensor_array_size_, /*tensor_array_gradients=*/{})); } } @@ -73,7 +71,7 @@ Status XlaResource::SetTypeAndShape(DataType type, const TensorShape& shape) { return Status::OK(); } -Status XlaResource::SetValue(const xla::ComputationDataHandle& value) { +Status XlaResource::SetValue(const xla::XlaOp& value) { if (type_ == DT_INVALID) { return errors::InvalidArgument( "Resource '", name_, @@ -83,7 +81,7 @@ Status XlaResource::SetValue(const xla::ComputationDataHandle& value) { return Status::OK(); } -Status XlaResource::SetZeroValue(xla::ComputationBuilder* builder) { +Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { if (type_ == DT_INVALID) { return errors::InvalidArgument( "Resource '", name_, @@ -121,9 +119,9 @@ Status XlaResource::SetZeroValue(xla::ComputationBuilder* builder) { return Status::OK(); } -Status XlaResource::GetOrCreateTensorArrayGradient( - const string& source, xla::ComputationBuilder* builder, - XlaResource** gradient_out) { +Status XlaResource::GetOrCreateTensorArrayGradient(const string& source, + xla::XlaBuilder* builder, + XlaResource** gradient_out) { VLOG(2) << "Gradient lookup for resource: " << name_ << " gradient: " << source; TF_RET_CHECK(kind_ == kTensorArray); @@ -132,7 +130,7 @@ Status XlaResource::GetOrCreateTensorArrayGradient( TensorShape ta_shape; ta_shape.AddDim(tensor_array_size_); ta_shape.AppendShape(shape_); - xla::ComputationDataHandle gradient_value = builder->Broadcast( + xla::XlaOp gradient_value = builder->Broadcast( XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes()); gradient.reset( new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1, @@ -144,13 +142,12 @@ Status XlaResource::GetOrCreateTensorArrayGradient( return Status::OK(); } -Status XlaResource::Pack(xla::ComputationDataHandle* pack, - xla::ComputationBuilder* builder) const { +Status XlaResource::Pack(xla::XlaOp* pack, xla::XlaBuilder* builder) const { if (tensor_array_gradients_.empty()) { *pack = value_; } else { TF_RET_CHECK(kind_ == kTensorArray); - std::vector elems; + std::vector elems; elems.push_back(value_); for (const auto& gradient : tensor_array_gradients_) { elems.push_back(gradient.second->value_); @@ -161,8 +158,8 @@ Status XlaResource::Pack(xla::ComputationDataHandle* pack, } Status XlaResource::SetFromPack(const std::set& gradient_sources, - const xla::ComputationDataHandle& pack, - xla::ComputationBuilder* builder) { + const xla::XlaOp& pack, + xla::XlaBuilder* 
builder) { if (gradient_sources.empty()) { if (!initialized()) { initial_value_ = pack; diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h index 1bb2c7274ecdf0..9ce36d1aa76223 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.h +++ b/tensorflow/compiler/tf2xla/xla_resource.h @@ -18,7 +18,7 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" @@ -37,8 +37,7 @@ class XlaResource { }; XlaResource(Kind kind, int arg_num, string name, DataType type, - TensorShape shape, - const xla::ComputationDataHandle& initial_value, + TensorShape shape, const xla::XlaOp& initial_value, int64 tensor_array_size, const std::set& tensor_array_gradients); @@ -69,16 +68,14 @@ class XlaResource { // this is the shape of each entry in the TensorArray/Stack. const TensorShape& shape() const { return shape_; } - const xla::ComputationDataHandle& value() const { return value_; } + const xla::XlaOp& value() const { return value_; } // Value of the resource at computation entry. Used to detect which // variables have new values that need to be written back. - const xla::ComputationDataHandle& initial_value() const { - return initial_value_; - } + const xla::XlaOp& initial_value() const { return initial_value_; } // A variable is initialized if it has a value. - bool initialized() const { return value_.handle() > 0; } + bool initialized() const { return value_.builder() != nullptr; } // Sets the type and shape of the resource. The type and shape of a resource // must not change once the variable has been initialized. @@ -86,17 +83,17 @@ class XlaResource { // Sets the current value of the resource. Returns an error if the type is not // set to a valid value. - Status SetValue(const xla::ComputationDataHandle& value); + Status SetValue(const xla::XlaOp& value); // Sets the current value of the resource to an all-zero value. - Status SetZeroValue(xla::ComputationBuilder* builder); + Status SetZeroValue(xla::XlaBuilder* builder); // Looks up the gradient for `source`, or creates it if it does not already // exist. The call target must be an initialized TensorArray resource. A // TensorArray can have multiple named gradients; see the operator // documentation for TensorArrayGradV3 for details. Status GetOrCreateTensorArrayGradient(const string& source, - xla::ComputationBuilder* builder, + xla::XlaBuilder* builder, XlaResource** gradient_out); // Packs a resource into a single XLA value `pack`, suitable for use as @@ -104,8 +101,7 @@ class XlaResource { // gradients, sets `*pack` to `value`. // For TensorArrays with gradients, packs the value and its gradient values in // a tuple; the gradients values are packed in order by source name. - Status Pack(xla::ComputationDataHandle* pack, - xla::ComputationBuilder* builder) const; + Status Pack(xla::XlaOp* pack, xla::XlaBuilder* builder) const; // Updates the resource with values from `pack`. If `gradient_sources` is // non-empty, treats `pack` as a tuple that represents a TensorArray and @@ -114,8 +110,7 @@ class XlaResource { // values. // Opposite of Pack(). 
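A recurring detail in the hunks above and below is how a "null" value is represented: a ComputationDataHandle was null when its integer handle was 0, whereas a default-constructed `xla::XlaOp` simply carries no builder, so validity checks compare `builder()` against `nullptr`. A small sketch of that idiom (the helper and example function are illustrative, not from the patch):

```c++
#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"

// Illustrative helper mirroring the checks used in the patch, e.g.
// XlaResource::initialized() and TF_RET_CHECK(handle.builder() != nullptr).
bool IsValid(const xla::XlaOp& op) { return op.builder() != nullptr; }

void Example(xla::XlaBuilder* b) {
  xla::XlaOp uninitialized;                      // IsValid(...) == false
  xla::XlaOp zero = b->ConstantR0<float>(0.0f);  // IsValid(...) == true
  (void)uninitialized;
  (void)zero;
}
```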
Status SetFromPack(const std::set& gradient_sources, - const xla::ComputationDataHandle& pack, - xla::ComputationBuilder* builder); + const xla::XlaOp& pack, xla::XlaBuilder* builder); // TensorArray and Stack specific fields @@ -144,8 +139,8 @@ class XlaResource { DataType type_; TensorShape shape_; - xla::ComputationDataHandle value_; - xla::ComputationDataHandle initial_value_; + xla::XlaOp value_; + xla::XlaOp initial_value_; int64 tensor_array_size_ = -1; diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index 1af9cb6d2ab15a..1b8e516770c3e2 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -53,7 +53,6 @@ xla_proto_library( deps = [ ":xla_data_proto", "//tensorflow/compiler/xla/service:hlo_proto", - "//tensorflow/compiler/xla/service:session_proto", ], ) @@ -99,8 +98,9 @@ cc_library( hdrs = ["service_interface.h"], visibility = [":friends"], deps = [ + ":status", + ":xla_data_proto", ":xla_proto", - "//tensorflow/core:lib", ], ) @@ -244,6 +244,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":protobuf_util", + ":status", ":status_macros", ":statusor", ":types", @@ -302,13 +303,13 @@ cc_library( ":array2d", ":array3d", ":array4d", - ":shape_tree", ":shape_util", ":sparse_index_array", ":status_macros", ":types", ":util", ":xla_data_proto", + "//tensorflow/core:framework", "//tensorflow/core:lib", ], ) @@ -323,12 +324,30 @@ tf_cc_test( ":shape_util", ":test", ":types", + "//tensorflow/compiler/tf2xla:common", "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) +cc_library( + name = "error_spec", + hdrs = ["error_spec.h"], +) + +cc_library( + name = "literal_comparison", + srcs = ["literal_comparison.cc"], + hdrs = ["literal_comparison.h"], + deps = [ + ":error_spec", + ":literal_util", + ":util", + "//tensorflow/core:lib", + ], +) + cc_library( name = "metric_table_report", srcs = ["metric_table_report.cc"], @@ -563,6 +582,7 @@ tf_cc_test( ":shape_util", ":test", ":xla_data_proto", + "//tensorflow/core:test", "//tensorflow/core:test_main", ], ) diff --git a/tensorflow/compiler/xla/README.md b/tensorflow/compiler/xla/README.md index c93c39e180655e..39f8caaa961dc7 100644 --- a/tensorflow/compiler/xla/README.md +++ b/tensorflow/compiler/xla/README.md @@ -1 +1,7 @@ -This is the home of XLA. +


+ +XLA (Accelerated Linear Algebra) is a domain-specific compiler for linear +algebra that optimizes TensorFlow computations. See the +[documentation](https://www.tensorflow.org/performance/xla/) for more details. diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index 286d06d12ffca7..8f08d3b2e04670 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -63,7 +63,6 @@ cc_library( srcs = ["client.cc"], hdrs = ["client.h"], deps = [ - ":computation", ":global_data", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:literal_util", @@ -76,7 +75,7 @@ cc_library( "//tensorflow/compiler/xla:xla_proto", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", - "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", ], ) @@ -87,6 +86,7 @@ cc_library( hdrs = ["executable_build_options.h"], deps = [ "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/core:lib", @@ -99,17 +99,18 @@ cc_library( hdrs = ["local_client.h"], deps = [ ":client", - ":computation", ":executable_build_options", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:compiler", "//tensorflow/compiler/xla/service:device_memory_allocator", "//tensorflow/compiler/xla/service:executable", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:local_service", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/compiler/xla/service:source_map_util", @@ -125,7 +126,6 @@ cc_library( hdrs = ["compile_only_client.h"], deps = [ ":client", - ":computation", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -161,47 +161,6 @@ cc_library( ], ) -cc_library( - name = "computation", - srcs = ["computation.cc"], - hdrs = ["computation.h"], - deps = [ - "//tensorflow/compiler/xla:service_interface", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/service:session_proto", - "//tensorflow/core:lib", - ], -) - -cc_library( - name = "computation_builder", - srcs = ["computation_builder.cc"], - hdrs = ["computation_builder.h"], - deps = [ - ":client", - ":computation", - ":global_data", - ":padding", - "//tensorflow/compiler/xla:array", - "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:array3d", - "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/core:lib", - ], -) - cc_library( name = "sharding_builder", srcs = ["sharding_builder.cc"], 
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index 328e1b8fa84e7b..3d596a6e65430b 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -64,7 +64,7 @@ StatusOr> Client::Transfer( } StatusOr> Client::TransferToServer( - const Literal& literal, const DeviceHandle* device_handle) { + const LiteralSlice& literal, const DeviceHandle* device_handle) { TransferToServerRequest request; *request.mutable_literal() = literal.ToProto(); if (device_handle) { @@ -91,7 +91,7 @@ StatusOr> Client::TransferToServer( return MakeUnique(stub_, response.data()); } -Status Client::TransferToInfeed(const Literal& literal, int64 replica_id, +Status Client::TransferToInfeed(const LiteralSlice& literal, int64 replica_id, const DeviceHandle* device_handle) { TransferToInfeedRequest request; *request.mutable_literal() = literal.ToProto(); @@ -161,22 +161,6 @@ Status Client::ResetDevice() { return Status::OK(); } -StatusOr> Client::ExecuteAndTransfer( - const Computation& computation, - tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions* execution_options, - ExecutionProfile* execution_profile) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr data, - Execute(computation, arguments, execution_options, execution_profile)); - - const Shape* shape_with_output_layout = nullptr; - if (execution_options && execution_options->has_shape_with_output_layout()) { - shape_with_output_layout = &execution_options->shape_with_output_layout(); - } - return Transfer(*data, shape_with_output_layout); -} - StatusOr> Client::ExecuteAndTransfer( const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, @@ -221,65 +205,11 @@ StatusOr> Client::ComputeConstant( return Literal::CreateFromProto(response.literal()); } -StatusOr Client::LoadSnapshot(const SessionModule& module) { - LoadComputationSnapshotRequest request; - *request.mutable_module() = module; - LoadComputationSnapshotResponse response; - - Status s = stub_->LoadComputationSnapshot(&request, &response); - if (!s.ok()) { - return s; - } - - VLOG(1) << "load snapshot response: " << response.ShortDebugString(); - return Computation(stub_, response.computation()); -} - StatusOr Client::LoadSnapshot(const HloSnapshot& module) { TF_RET_CHECK(module.has_hlo() && module.hlo().has_hlo_module()); return XlaComputation(module.hlo().hlo_module()); } -StatusOr> Client::Execute( - const Computation& computation, - tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions* execution_options, - ExecutionProfile* execution_profile) { - ExecuteRequest request; - *request.mutable_computation() = computation.handle(); - - if (execution_options == nullptr) { - *request.mutable_execution_options() = CreateDefaultExecutionOptions(); - } else { - *request.mutable_execution_options() = *execution_options; - } - for (GlobalData* argument : arguments) { - CHECK(argument != nullptr) << "Argument pointers must not be null."; - *request.add_arguments() = argument->handle(); - } - - ExecuteResponse response; - VLOG(1) << "making execute request: " << request.ShortDebugString(); - Status s = stub_->Execute(&request, &response); - VLOG(1) << "done with request"; - - if (!s.ok()) { - return s; - } - - if (execution_profile != nullptr) { - *execution_profile = response.profile(); - if (VLOG_IS_ON(1)) { - TF_ASSIGN_OR_RETURN( - auto execution_stats, - ExecutionStatsAsString(computation, response.profile())); - VLOG(1) << execution_stats; - } - } - - return MakeUnique(stub_, 
response.output()); -} - StatusOr> Client::Execute( const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, @@ -320,41 +250,6 @@ StatusOr> Client::Execute( return MakeUnique(stub_, response.output()); } -StatusOr>> Client::ExecuteParallel( - tensorflow::gtl::ArraySlice computations) { - ExecuteParallelRequest request; - - for (const ComputationInstance& computation : computations) { - ExecuteRequest single_request; - *single_request.mutable_computation() = computation.computation.handle(); - for (GlobalData* argument : computation.arguments) { - *single_request.add_arguments() = argument->handle(); - } - *single_request.mutable_execution_options() = computation.execution_options; - *request.add_requests() = single_request; - } - - ExecuteParallelResponse response; - VLOG(1) << "making execute-parallel request: " << request.ShortDebugString(); - tensorflow::Status s = stub_->ExecuteParallel(&request, &response); - VLOG(1) << "done with request"; - - if (!s.ok()) { - return s; - } - - std::vector> outputs; - for (size_t i = 0; i < computations.size(); ++i) { - outputs.push_back( - MakeUnique(stub_, response.responses(i).output())); - if (computations[i].execution_profile != nullptr) { - *computations[i].execution_profile = response.responses(i).profile(); - } - } - - return std::move(outputs); -} - StatusOr>> Client::ExecuteParallel( tensorflow::gtl::ArraySlice computations) { ExecuteGraphParallelRequest request; @@ -372,7 +267,7 @@ StatusOr>> Client::ExecuteParallel( ExecuteParallelResponse response; VLOG(1) << "making execute-graph-parallel request: " << request.ShortDebugString(); - tensorflow::Status s = stub_->ExecuteGraphParallel(&request, &response); + Status s = stub_->ExecuteGraphParallel(&request, &response); VLOG(1) << "done with request"; if (!s.ok()) { @@ -401,7 +296,7 @@ StatusOr> Client::GetDeviceHandles( GetDeviceHandlesResponse response; VLOG(1) << "making get device request: " << request.ShortDebugString(); - tensorflow::Status s = stub_->GetDeviceHandles(&request, &response); + Status s = stub_->GetDeviceHandles(&request, &response); VLOG(1) << "done with request"; if (!s.ok()) { @@ -449,24 +344,6 @@ StatusOr>> Client::DeconstructTuple( return std::move(handles); } -StatusOr Client::GetComputationStats( - const Computation& computation, const DebugOptions& debug_options) const { - ComputationStatsRequest request; - *request.mutable_computation() = computation.handle(); - *request.mutable_debug_options() = debug_options; - ComputationStatsResponse response; - - VLOG(1) << "making computation stats request"; - Status s = stub_->GetComputationStats(&request, &response); - VLOG(1) << "done with request"; - - if (!s.ok()) { - return s; - } - CHECK(response.has_stats()); - return response.stats(); -} - StatusOr Client::GetComputationStats( const XlaComputation& computation, const DebugOptions& debug_options) const { @@ -488,23 +365,6 @@ StatusOr Client::GetComputationStats( return response.stats(); } -StatusOr> Client::GetComputationShape( - const Computation& computation) { - GetComputationShapeRequest request; - *request.mutable_computation() = computation.handle(); - GetComputationShapeResponse response; - - VLOG(1) << "making get-computation-shape request"; - Status s = stub_->GetComputationShape(&request, &response); - VLOG(1) << "done with request"; - - if (!s.ok()) { - return s; - } - - return WrapUnique(response.release_program_shape()); -} - StatusOr> Client::GetComputationShape( const XlaComputation& computation) { TF_ASSIGN_OR_RETURN(const auto& 
result, computation.GetProgramShape()); @@ -527,28 +387,6 @@ StatusOr Client::GetShape(const GlobalData& data) { return response.shape(); } -StatusOr Client::ExecutionStatsAsString( - const Computation& computation, const ExecutionProfile& profile) { - TF_ASSIGN_OR_RETURN( - auto computation_stats, - GetComputationStats(computation, - legacy_flags::GetDebugOptionsFromFlags())); - int64 total_flops = - computation_stats.flop_count() + computation_stats.transcendental_count(); - if (profile.compute_time_ns() > 0) { - int64 nanoseconds = profile.compute_time_ns(); - int64 cycle_count = profile.compute_cycle_count(); - double gflops = total_flops / nanoseconds; - return tensorflow::strings::StrCat( - "[Execution Statistics] flop count: ", computation_stats.flop_count(), - ", transcendental count: ", computation_stats.transcendental_count(), - ", compute execution time: ", nanoseconds, " nsec", - ", compute cycles: ", cycle_count, ", performance: ", gflops, - "gflop/s"); - } - return string("[Execution Statistics] not available."); -} - StatusOr Client::ExecutionStatsAsString( const XlaComputation& computation, const ExecutionProfile& profile) { TF_ASSIGN_OR_RETURN( diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index a63ff4c56d1dd7..68f0d0ac78c859 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -19,11 +19,10 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service_interface.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -52,21 +51,6 @@ class Client { // device is chosen by the service. // * If execution_profile is not nullptr then the pointed-to ExecutionProfile // will be filled with profile data from the execution. - StatusOr> Execute( - const Computation& computation, - tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions* execution_options = nullptr, - ExecutionProfile* execution_profile = nullptr); - - // Executes the computation with the given arguments and returns the global - // data that was produced from the execution. - // * If execution_options is not nullptr, these options are passed to the - // service to affect how it compiles our computation. (The pointer does not - // need to live beyond this call.) - // * If execution_profile is not nullptr then the pointed-to ExecutionProfile - // will be filled with profile data from the execution. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr> Execute( const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, @@ -78,34 +62,6 @@ class Client { // executed on the devices associated with the handles by partitioning the // computation based on the attached sharding attributes. Otherwise, a // device is chosen by the service. 
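For orientation, a sketch of the client-side flow after this migration: computations are built with XlaBuilder, arguments are uploaded with TransferToServer (which now accepts a LiteralSlice, so a Literal converts implicitly), and execution goes through the XlaComputation overloads. The helper name, the choice of op, and the XlaBuilder methods Parameter/Add/Build are assumptions based on the surrounding API rather than lines from this patch.

```c++
#include <memory>

#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/status_macros.h"

// Sketch: build x + x, upload the argument, execute, and fetch the result.
xla::StatusOr<std::unique_ptr<xla::Literal>> RunDoubled(
    xla::Client* client, const xla::Literal& argument) {
  xla::XlaBuilder builder("double");
  xla::XlaOp x = builder.Parameter(0, argument.shape(), "x");
  builder.Add(x, x);  // the last enqueued op becomes the root
  TF_ASSIGN_OR_RETURN(xla::XlaComputation computation, builder.Build());

  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::GlobalData> data,
                      client->TransferToServer(argument));
  return client->ExecuteAndTransfer(computation, {data.get()});
}
```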
- struct ComputationInstance { - const Computation& computation; - std::vector arguments; - ExecutionOptions execution_options; - ExecutionProfile* execution_profile; - - ComputationInstance(const Computation& computation, - std::vector arguments, - ExecutionOptions execution_options, - ExecutionProfile* execution_profile) - : computation(computation), - arguments(std::move(arguments)), - execution_options(execution_options), - execution_profile(execution_profile) {} - }; - - // Executes a list ComputationInstances and returns global data produced from - // each computation. - StatusOr>> ExecuteParallel( - tensorflow::gtl::ArraySlice computations); - - // A struct to represent a computation instance to be executed. - // * If execution_options.device_handles is not empty, the computation is - // executed on the devices associated with the handles by partitioning the - // computation based on the attached sharding attributes. Otherwise, a - // device is chosen by the service. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. struct XlaComputationInstance { const XlaComputation& computation; std::vector arguments; @@ -125,7 +81,6 @@ class Client { // Executes a list XlaComputationInstances and returns global data produced // from each computation. // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr>> ExecuteParallel( tensorflow::gtl::ArraySlice computations); @@ -152,14 +107,14 @@ class Client { // device (and its replicas if replication is enabled). Otherwise, data is // transferred to the default device (and its replicas). StatusOr> TransferToServer( - const Literal& literal, const DeviceHandle* device_handle = nullptr); + const LiteralSlice& literal, const DeviceHandle* device_handle = nullptr); // Transfer the given literal to the Infeed interface of the device. // // device_handle and replica_id together specify a particular device; a device // assigned for the given replica_id among the replicas that the given device // handle belongs to. - Status TransferToInfeed(const Literal& literal, int64 replica_id = 0, + Status TransferToInfeed(const LiteralSlice& literal, int64 replica_id = 0, const DeviceHandle* device_handle = nullptr); // Transfers from the Outfeed of the device. @@ -177,17 +132,6 @@ class Client { // Executes the computation with the given arguments and transfers the result // to the client as a literal. Parameters are defined the same as for // Execute() and Transfer(). - StatusOr> ExecuteAndTransfer( - const Computation& computation, - tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions* execution_options = nullptr, - ExecutionProfile* execution_profile = nullptr); - - // Executes the computation with the given arguments and transfers the result - // to the client as a literal. Parameters are defined the same as for - // Execute() and Transfer(). - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr> ExecuteAndTransfer( const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, @@ -209,8 +153,6 @@ class Client { // // If output_layout is non-null, then the output of the computation will be // stored using that layout. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr> ComputeConstant( const XlaComputation& computation, const Layout* output_layout = nullptr) const; @@ -223,12 +165,6 @@ class Client { const GlobalData& data); // Retrieves the statistics of the given computation. 
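Similarly, parallel execution now goes through XlaComputationInstance rather than the removed ComputationInstance. The sketch below assumes the new struct keeps the same constructor argument order as the old one; treat that ordering, the helper name, and the one-argument-per-computation setup as assumptions.

```c++
#include <memory>
#include <vector>

#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/execution_options_util.h"

// Sketch: run two computations in a single round trip via ExecuteParallel.
xla::StatusOr<std::vector<std::unique_ptr<xla::GlobalData>>> RunPair(
    xla::Client* client, const xla::XlaComputation& a, xla::GlobalData* arg_a,
    const xla::XlaComputation& b, xla::GlobalData* arg_b) {
  std::vector<xla::Client::XlaComputationInstance> instances;
  instances.emplace_back(a, std::vector<xla::GlobalData*>{arg_a},
                         xla::CreateDefaultExecutionOptions(),
                         /*execution_profile=*/nullptr);
  instances.emplace_back(b, std::vector<xla::GlobalData*>{arg_b},
                         xla::CreateDefaultExecutionOptions(),
                         /*execution_profile=*/nullptr);
  return client->ExecuteParallel(instances);
}
```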
- StatusOr GetComputationStats( - const Computation& computation, const DebugOptions& debug_options) const; - - // Retrieves the statistics of the given computation. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr GetComputationStats( const XlaComputation& computation, const DebugOptions& debug_options) const; @@ -239,13 +175,6 @@ class Client { // As above, but returns the shape of the provided computation (parameter // types/names and return type). - StatusOr> GetComputationShape( - const Computation& computation); - - // As above, but returns the shape of the provided computation (parameter - // types/names and return type). - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr> GetComputationShape( const XlaComputation& computation); @@ -253,9 +182,6 @@ class Client { // two computations via a pair of Send and Recv instructions. StatusOr CreateChannelHandle(); - StatusOr LoadSnapshot(const SessionModule& module); - - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr LoadSnapshot(const HloSnapshot& module); ServiceInterface* stub() { return stub_; } @@ -263,8 +189,6 @@ class Client { private: // Returns the execution statistics (e.g., gflop/s) as a string from the // ExecutionProfile returned from an execution of the computation. - StatusOr ExecutionStatsAsString(const Computation& computation, - const ExecutionProfile& profile); StatusOr ExecutionStatsAsString(const XlaComputation& computation, const ExecutionProfile& profile); diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc index 96e38bca010879..dc69d2097ebe14 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.cc +++ b/tensorflow/compiler/xla/client/compile_only_client.cc @@ -21,24 +21,6 @@ limitations under the License. namespace xla { -StatusOr>> -CompileOnlyClient::CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& options) { - std::vector service_instances; - service_instances.reserve(computations.size()); - for (const AotComputationInstance& instance : computations) { - service_instances.push_back({}); - CompileOnlyService::AotComputationInstance& service_instance = - service_instances.back(); - TF_RET_CHECK(instance.computation != nullptr); - service_instance.computation = instance.computation->handle(); - service_instance.argument_layouts = instance.argument_layouts; - service_instance.result_layout = instance.result_layout; - } - return compiler_service_->CompileAheadOfTime(service_instances, options); -} - StatusOr>> CompileOnlyClient::CompileAheadOfTime( const tensorflow::gtl::ArraySlice computations, diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h index c8725b8517484a..f9a7c31270c7a1 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.h +++ b/tensorflow/compiler/xla/client/compile_only_client.h @@ -17,7 +17,6 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_ #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/service/compile_only_service.h" #include "tensorflow/compiler/xla/service/compiler.h" @@ -38,26 +37,7 @@ class CompileOnlyClient : public Client { CompileOnlyClient(const CompileOnlyClient&) = delete; void operator=(const CompileOnlyClient&) = delete; - // A description of a computation to compile using CompileAheadOfTime. - struct AotComputationInstance { - const Computation* computation; - // Inform the compiler of the expected layout for arguments. - std::vector argument_layouts; - // Specifies the expected result layout. - const Shape* result_layout; - }; - - // Compiles a list of computations for ahead-of-time execution. This is - // intended for use in static compilation. The |options| parameter describes - // the target for which the compiler should emit code. - StatusOr>> - CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& options); - // A description of an xla computation to compile using CompileAheadOfTime. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. struct AotXlaComputationInstance { const XlaComputation* computation; // Inform the compiler of the expected layout for arguments. @@ -69,8 +49,6 @@ class CompileOnlyClient : public Client { // Compiles a list of xla computations for ahead-of-time execution. This is // intended for use in static compilation. The |options| parameter describes // the target for which the compiler should emit code. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr>> CompileAheadOfTime( const tensorflow::gtl::ArraySlice computations, diff --git a/tensorflow/compiler/xla/client/computation.cc b/tensorflow/compiler/xla/client/computation.cc deleted file mode 100644 index e6c57bda0f0c4c..00000000000000 --- a/tensorflow/compiler/xla/client/computation.cc +++ /dev/null @@ -1,77 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/client/computation.h" - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/core/lib/core/errors.h" - -namespace xla { - -Computation::Computation() : parent_(nullptr) {} - -Computation::Computation(ServiceInterface* parent, - const ComputationHandle& handle) - : handle_(handle), parent_(parent) {} - -Computation::Computation(Computation&& computation) - : handle_(std::move(computation.handle_)), parent_(computation.parent_) { - computation.ResetWithoutFreeing(); -} - -void Computation::Reset() { - // TODO(b/34469253) deallocate any owned computation. 
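With client/computation.* deleted below, the snapshot path changes as well: a serialized computation arrives as an HloSnapshot rather than a SessionModule, LoadSnapshot returns an XlaComputation, and the program shape is queried from that object directly. A brief sketch, with the helper name chosen here for illustration:

```c++
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/service/hlo.pb.h"
#include "tensorflow/compiler/xla/status_macros.h"

// Sketch: recover the parameter/result shapes of a snapshotted computation.
xla::StatusOr<xla::ProgramShape> ShapeFromSnapshot(
    xla::Client* client, const xla::HloSnapshot& snapshot) {
  TF_ASSIGN_OR_RETURN(xla::XlaComputation computation,
                      client->LoadSnapshot(snapshot));
  return computation.GetProgramShape();
}
```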
- ResetWithoutFreeing(); -} - -StatusOr> Computation::Snapshot() const { - SnapshotComputationRequest request; - *request.mutable_computation() = handle_; - SnapshotComputationResponse response; - - TF_RETURN_IF_ERROR(parent_->SnapshotComputation(&request, &response)); - - return WrapUnique(response.release_module()); -} - -Computation::~Computation() { Reset(); } - -Computation& Computation::operator=(Computation&& computation) { - if (&computation != this) { - Reset(); - handle_ = computation.handle_; - parent_ = computation.parent_; - computation.ResetWithoutFreeing(); - } - return *this; -} - -void Computation::ResetWithoutFreeing() { - handle_.Clear(); - parent_ = nullptr; -} - -StatusOr Computation::GetProgramShape() const { - GetComputationShapeRequest request; - *request.mutable_computation() = handle_; - GetComputationShapeResponse response; - - TF_RETURN_IF_ERROR(parent_->GetComputationShape(&request, &response)); - - return std::move(*response.mutable_program_shape()); -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/client/computation.h b/tensorflow/compiler/xla/client/computation.h deleted file mode 100644 index a53fc9e9cf3470..00000000000000 --- a/tensorflow/compiler/xla/client/computation.h +++ /dev/null @@ -1,80 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_H_ -#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_H_ - -#include - -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service_interface.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/macros.h" - -namespace xla { - -// Wraps a ComputationHandle protobuf with a lifetime. Computation is -// movable and not copyable to capture the same kind of unique -// ownership that std::unique_ptr represents. -class Computation { - public: - // Creates a null Computation. - Computation(); - - // parent: stub for the service on which we will deallocate the computation - // when it is no longer needed. - // handle: the computation handle protobuf from the service. - Computation(ServiceInterface* parent, const ComputationHandle& handle); - - Computation(Computation&& computation); - - // Deallocates the computation. - ~Computation(); - - Computation& operator=(Computation&& computation); - - // Returns the underlying handle. - const ComputationHandle& handle() const { return handle_; } - - // Sets handle to a null state and clears any owned computation. - void Reset(); - - // Requests that we snapshot the computation into a serializable protocol - // buffer form. - StatusOr> Snapshot() const; - - // Returns true if this object is a null Computation. 
- bool IsNull() const { return parent_ == nullptr; } - - // Returns the "program shape" (parameter and return shapes) for this - // computation. - StatusOr GetProgramShape() const; - - private: - void ResetWithoutFreeing(); - - ComputationHandle handle_; // Handle that is wrapped by this class. - - // Stub that the handle is deallocated on when this object's lifetime ends. - ServiceInterface* parent_; - - TF_DISALLOW_COPY_AND_ASSIGN(Computation); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_H_ diff --git a/tensorflow/compiler/xla/client/computation_builder.cc b/tensorflow/compiler/xla/client/computation_builder.cc deleted file mode 100644 index 83c7cb17440213..00000000000000 --- a/tensorflow/compiler/xla/client/computation_builder.cc +++ /dev/null @@ -1,1574 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/client/computation_builder.h" - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/protobuf.h" - -namespace xla { - -ComputationBuilder::ComputationBuilder(Client* client, - const string& computation_name) - : name_(computation_name), client_(client) {} - -ComputationBuilder::~ComputationBuilder() {} - -void ComputationBuilder::NoteError(const Status& error) { - if (die_immediately_on_error_) { - LOG(FATAL) << "error building computation: " << error; - } - - if (first_error_.ok()) { - first_error_ = error; - first_error_backtrace_.CreateCurrent(/*skip_count=*/1); - } -} - -std::unique_ptr ComputationBuilder::CreateSubBuilder( - const string& computation_name) { - auto sub_builder = MakeUnique(client_, computation_name); - sub_builder->parent_builder_ = this; - sub_builder->die_immediately_on_error_ = die_immediately_on_error_; - return sub_builder; -} - -Status ComputationBuilder::PrepareComputation() { - TF_RETURN_IF_ERROR(first_error_); - - if (!computation_.IsNull()) { - return Status::OK(); - } - - ComputationRequest request; - request.set_name(name_); - ComputationResponse response; - - VLOG(2) << "making computation request"; - Status s = client_->stub()->Computation(&request, &response); - VLOG(2) << "done with computation request"; - - if (!s.ok()) { - NoteError(s); - return first_error_; - } - - computation_ = Computation(client_->stub(), response.computation()); - return Status::OK(); -} - -Status ComputationBuilder::RunOp(OpRequest* op_request, - OpResponse* op_response) { - TF_RETURN_IF_ERROR(first_error_); - 
TF_RETURN_IF_ERROR(PrepareComputation()); - - // Fill in fields that are set on every OpRequest. - *op_request->mutable_computation() = computation_.handle(); - *op_request->mutable_metadata() = metadata_; - if (sharding_) { - *op_request->mutable_sharding() = *sharding_; - } - - const string& op_name = - OpRequest::descriptor()->FindFieldByNumber(op_request->op_case())->name(); - VLOG(2) << "running op request: " << op_name; - Status status = client_->stub()->Op(op_request, op_response); - VLOG(2) << "done with op request: " << op_name; - return status; -} - -void ComputationBuilder::RunOpAndNoteError(OpRequest* op_request) { - OpResponse op_response; - Status status = RunOp(op_request, &op_response); - if (!status.ok()) { - NoteError(status); - } -} - -ComputationDataHandle ComputationBuilder::RunOpAndParseResponse( - OpRequest* op_request) { - OpResponse op_response; - Status status = RunOp(op_request, &op_response); - if (!status.ok()) { - NoteError(status); - return ComputationDataHandle(); - } - if (op_response.output().handle() == 0) { - NoteError(InternalError("No output handle")); - return ComputationDataHandle(); - } - return op_response.output(); -} - -bool ComputationBuilder::MakeWindow( - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, Window* window) { - const auto verify_size = [&](const size_t x, const char* x_name) { - if (x == 0 || x == window_dimensions.size()) { - return true; - } else { - NoteError(InvalidArgument( - "%s", tensorflow::strings::StrCat( - "Window has different number of window dimensions than of ", - x_name, "\nNumber of window dimensions: ", - window_dimensions.size(), "\nNumber of ", x_name, ": ", x, - "\n") - .c_str())); // - return false; - } - }; - if (!verify_size(window_strides.size(), "window strides") || - !verify_size(padding.size(), "padding entries") || - !verify_size(lhs_dilation.size(), "lhs dilation factors") || - !verify_size(rhs_dilation.size(), "rhs dilation factors")) { - return false; - } - - window->Clear(); - for (size_t i = 0; i < window_dimensions.size(); i++) { - auto dim = window->add_dimensions(); - dim->set_size(window_dimensions[i]); - if (!window_strides.empty()) { - dim->set_stride(window_strides[i]); - } else { - dim->set_stride(1); - } - if (!padding.empty()) { - dim->set_padding_low(padding[i].first); - dim->set_padding_high(padding[i].second); - } else { - dim->set_padding_low(0); - dim->set_padding_high(0); - } - if (!lhs_dilation.empty()) { - dim->set_base_dilation(lhs_dilation[i]); - } else { - dim->set_base_dilation(1); - } - if (!rhs_dilation.empty()) { - dim->set_window_dilation(rhs_dilation[i]); - } else { - dim->set_window_dilation(1); - } - dim->set_window_reversal(false); - } - return true; -} - -ComputationDataHandle ComputationBuilder::ConstantLiteral( - const Literal& literal) { - OpRequest op_request; - ConstantRequest* request = op_request.mutable_constant_request(); - *request->mutable_literal() = literal.ToProto(); - VLOG(3) << "created constant: " << request->literal().ShortDebugString(); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Parameter(int64 parameter_number, - const Shape& shape, - const string& name) { - OpRequest op_request; - ParameterRequest* request = op_request.mutable_parameter_request(); - *request->mutable_shape() = shape; - request->set_parameter(parameter_number); - 
request->set_name(name); - return RunOpAndParseResponse(&op_request); -} - -StatusOr> ComputationBuilder::GetShapeWithoutNoteError( - const ComputationDataHandle& operand) { - GetLocalShapeRequest request; - *request.mutable_computation() = computation_.handle(); - *request.mutable_operand() = operand; - GetLocalShapeResponse response; - - VLOG(2) << "making get-shape request"; - TF_RETURN_IF_ERROR(client_->stub()->GetLocalShape(&request, &response)); - VLOG(2) << "done with request"; - - TF_RET_CHECK(response.has_shape()); - std::unique_ptr shape = WrapUnique(response.release_shape()); - TF_RET_CHECK(shape != nullptr); - return std::move(shape); -} - -StatusOr> ComputationBuilder::GetShape( - const ComputationDataHandle& operand) { - TF_RETURN_IF_ERROR(first_error_); - - auto status_or_shape = GetShapeWithoutNoteError(operand); - if (!status_or_shape.ok()) { - NoteError(status_or_shape.status()); - return first_error_; - } - return status_or_shape; -} - -StatusOr ComputationBuilder::GetProgramShape() { - TF_RETURN_IF_ERROR(first_error_); - - GetComputationShapeRequest request; - *request.mutable_computation() = computation_.handle(); - GetComputationShapeResponse response; - - VLOG(2) << "making get-program-shape-request"; - Status status = client_->stub()->GetComputationShape(&request, &response); - VLOG(2) << "done with get-program-shape-request"; - - if (!status.ok()) { - first_error_ = status; - return status; - } - - TF_RET_CHECK(response.has_program_shape()); - return std::move(*response.mutable_program_shape()); -} - -ComputationDataHandle ComputationBuilder::Slice( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides) { - OpRequest op_request; - SliceRequest* request = op_request.mutable_slice_request(); - *request->mutable_operand() = operand; - for (int64 index : start_indices) { - request->add_start_indices(index); - } - for (int64 index : limit_indices) { - request->add_limit_indices(index); - } - for (int64 index : strides) { - request->add_strides(index); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::SliceInDim( - const ComputationDataHandle& operand, int64 start_index, int64 limit_index, - int64 stride, int64 dimno) { - StatusOr> shape_status = GetShape(operand); - if (!shape_status.ok()) { - NoteError(shape_status.status()); - return ComputationDataHandle{}; - } - const Shape& shape = *shape_status.ValueOrDie(); - std::vector starts(ShapeUtil::Rank(shape), 0); - std::vector limits(shape.dimensions().begin(), - shape.dimensions().end()); - std::vector strides(ShapeUtil::Rank(shape), 1); - starts[dimno] = start_index; - limits[dimno] = limit_index; - strides[dimno] = stride; - return Slice(operand, starts, limits, strides); -} - -ComputationDataHandle ComputationBuilder::DynamicSlice( - const ComputationDataHandle& operand, - const ComputationDataHandle& start_indices, - tensorflow::gtl::ArraySlice slice_sizes) { - OpRequest op_request; - DynamicSliceRequest* request = op_request.mutable_dynamic_slice_request(); - *request->mutable_operand() = operand; - *request->mutable_start_indices() = start_indices; - for (int64 index : slice_sizes) { - request->add_slice_sizes(index); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::DynamicUpdateSlice( - const ComputationDataHandle& operand, const ComputationDataHandle& update, - const ComputationDataHandle& start_indices) { 
- OpRequest op_request; - DynamicUpdateSliceRequest* request = - op_request.mutable_dynamic_update_slice_request(); - *request->mutable_operand() = operand; - *request->mutable_update() = update; - *request->mutable_start_indices() = start_indices; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::ConcatInDim( - tensorflow::gtl::ArraySlice operands, - int64 dimension) { - OpRequest op_request; - ConcatenateRequest* request = op_request.mutable_concatenate_request(); - for (const ComputationDataHandle& operand : operands) { - *request->add_operands() = operand; - } - request->set_dimension(dimension); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Broadcast( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice broadcast_sizes) { - OpRequest op_request; - BroadcastRequest* request = op_request.mutable_broadcast_request(); - *request->mutable_operand() = operand; - for (int64 size : broadcast_sizes) { - request->add_broadcast_sizes(size); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Pad( - const ComputationDataHandle& operand, - const ComputationDataHandle& padding_value, - const PaddingConfig& padding_config) { - OpRequest op_request; - PadRequest* request = op_request.mutable_pad_request(); - *request->mutable_operand() = operand; - *request->mutable_padding_value() = padding_value; - *request->mutable_padding_config() = padding_config; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Reshape( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes) { - OpRequest op_request; - ReshapeRequest* request = op_request.mutable_reshape_request(); - *request->mutable_operand() = operand; - for (int64 dimension : dimensions) { - request->add_dimensions(dimension); - } - for (int64 new_size : new_sizes) { - request->add_new_sizes(new_size); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Reshape( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice new_sizes) { - if (!first_error_.ok()) { - return ComputationDataHandle(); - } - - StatusOr> shape = GetShape(operand); - if (!shape.ok()) { - return ComputationDataHandle(); - } - std::vector dimensions(shape.ValueOrDie()->dimensions().size()); - std::iota(dimensions.begin(), dimensions.end(), 0); - return Reshape(operand, dimensions, new_sizes); -} - -ComputationDataHandle ComputationBuilder::Collapse( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions) { - if (!first_error_.ok()) { - return ComputationDataHandle(); - } - - // Don't support out-of-order collapse here. - // Checks that the collapsed dimensions are in order and consecutive. - for (tensorflow::gtl::ArraySlice::size_type i = 1; - i < dimensions.size(); ++i) { - if (dimensions[i] - 1 != dimensions[i - 1]) { - NoteError(InvalidArgument( - "Collapsed dimensions are not in order and consecutive.")); - return ComputationDataHandle(); - } - } - - // Create a new sizes vector from the old shape, replacing the collapsed - // dimensions by the product of their sizes. 
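The comment just above (and the loop that follows) is the only non-trivial arithmetic in Collapse: consecutive dimensions are folded into the product of their sizes before a single Reshape is enqueued. A standalone restatement of that size computation, using plain std::vector for clarity; this is an illustration of the logic, not the removed method itself.

```c++
#include <cstdint>
#include <vector>

// Restates the new-sizes computation used by Collapse: dimensions between
// to_collapse.front() and to_collapse.back() are multiplied into one entry.
// Example: dims = {2, 3, 4, 5}, to_collapse = {1, 2}  ->  {2, 12, 5}.
std::vector<int64_t> CollapsedSizes(const std::vector<int64_t>& dims,
                                    const std::vector<int64_t>& to_collapse) {
  std::vector<int64_t> new_sizes;
  for (int64_t i = 0; i < static_cast<int64_t>(dims.size()); ++i) {
    if (to_collapse.size() <= 1 || i <= to_collapse.front() ||
        i > to_collapse.back()) {
      new_sizes.push_back(dims[i]);
    } else {
      new_sizes.back() *= dims[i];
    }
  }
  return new_sizes;
}
```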
- StatusOr> shape_or_status = GetShape(operand); - if (!shape_or_status.ok()) { - return ComputationDataHandle(); - } - std::unique_ptr original_shape = shape_or_status.ConsumeValueOrDie(); - - VLOG(3) << "original shape: " << ShapeUtil::HumanString(*original_shape); - VLOG(3) << "dims to collapse: " - << tensorflow::str_util::Join(dimensions, ","); - - if (dimensions.size() <= 1) { - // Not collapsing anything, trivially we can return the operand versus - // enqueueing a trivial reshape. - return operand; - } - - std::vector new_sizes; - for (int i = 0; i < ShapeUtil::Rank(*original_shape); ++i) { - if (i <= dimensions.front() || i > dimensions.back()) { - new_sizes.push_back(original_shape->dimensions(i)); - } else { - new_sizes.back() *= original_shape->dimensions(i); - } - } - - VLOG(3) << "new sizes: [" << tensorflow::str_util::Join(new_sizes, ",") - << "]"; - - return Reshape(operand, new_sizes); -} - -void ComputationBuilder::Trace(const string& tag, - const ComputationDataHandle& operand) { - OpRequest op_request; - TraceRequest* request = op_request.mutable_trace_request(); - request->set_tag(tag); - *request->mutable_operand() = operand; - RunOpAndNoteError(&op_request); -} - -ComputationDataHandle ComputationBuilder::Select( - const ComputationDataHandle& pred, const ComputationDataHandle& on_true, - const ComputationDataHandle& on_false) { - return TernaryOp(TRIOP_SELECT, pred, on_true, on_false); -} - -ComputationDataHandle ComputationBuilder::Tuple( - tensorflow::gtl::ArraySlice elements) { - OpRequest op_request; - VariadicOpRequest* request = op_request.mutable_variadic_op_request(); - request->set_varop(VAROP_TUPLE); - for (const ComputationDataHandle& operand : elements) { - *request->add_operands() = operand; - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::GetTupleElement( - const ComputationDataHandle& tuple_data, int64 index) { - OpRequest op_request; - GetTupleElementRequest* request = - op_request.mutable_get_tuple_element_request(); - *request->mutable_operand() = tuple_data; - request->set_index(index); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Eq( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_EQ, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Ne( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_NE, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Ge( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_GE, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Gt( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_GT, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Le( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_LE, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Lt( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_LT, lhs, 
rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Dot( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) { - StatusOr> lhs_shape_or_status = GetShape(lhs); - if (!lhs_shape_or_status.ok()) { - return ComputationDataHandle(); - } - std::unique_ptr lhs_shape = lhs_shape_or_status.ConsumeValueOrDie(); - - DotDimensionNumbers dimension_numbers; - dimension_numbers.add_lhs_contracting_dimensions( - lhs_shape->dimensions_size() == 1 ? 0 : 1); - dimension_numbers.add_rhs_contracting_dimensions(0); - return DotGeneral(lhs, rhs, dimension_numbers); -} - -ComputationDataHandle ComputationBuilder::DotGeneral( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - const DotDimensionNumbers& dimension_numbers) { - OpRequest op_request; - DotRequest* request = op_request.mutable_dot_request(); - *request->mutable_lhs() = lhs; - *request->mutable_rhs() = rhs; - *request->mutable_dimension_numbers() = dimension_numbers; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Conv( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, Padding padding) { - return ConvWithGeneralDimensions( - lhs, rhs, window_strides, padding, - CreateDefaultConvDimensionNumbers(window_strides.size())); -} - -ComputationDataHandle ComputationBuilder::ConvWithGeneralPadding( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding) { - return ConvGeneral(lhs, rhs, window_strides, padding, - CreateDefaultConvDimensionNumbers(window_strides.size())); -} - -bool ComputationBuilder::VerifyConvolution( - const Shape& lhs_shape, const Shape& rhs_shape, - const ConvolutionDimensionNumbers& dimension_numbers) { - if (ShapeUtil::Rank(lhs_shape) != ShapeUtil::Rank(rhs_shape)) { - NoteError( - InvalidArgument("Convolution arguments must have same number of " - "dimensions. Got: %s and %s", - ShapeUtil::HumanString(lhs_shape).c_str(), - ShapeUtil::HumanString(rhs_shape).c_str())); - return false; - } - int num_dims = ShapeUtil::Rank(lhs_shape); - if (num_dims < 2) { - NoteError(InvalidArgument( - "Convolution expects argument arrays with >= 3 dimensions. 
" - "Got: %s and %s", - ShapeUtil::HumanString(lhs_shape).c_str(), - ShapeUtil::HumanString(rhs_shape).c_str())); - return false; - } - int num_spatial_dims = num_dims - 2; - - const auto check_spatial_dimensions = - [&](const char* const field_name, - const tensorflow::protobuf::RepeatedField& - numbers) { - if (numbers.size() != num_spatial_dims) { - NoteError(InvalidArgument("Expected %d elements for %s, but got %d.", - num_spatial_dims, field_name, - numbers.size())); - return false; - } - for (int i = 0; i < numbers.size(); ++i) { - if (numbers.Get(i) < 0 || numbers.Get(i) >= num_dims) { - NoteError( - InvalidArgument("Convolution %s[%d] is out of bounds: %lld", - field_name, i, numbers.Get(i))); - return false; - } - } - return true; - }; - return check_spatial_dimensions( - "input_spatial_dimensions", - dimension_numbers.input_spatial_dimensions()) && - check_spatial_dimensions( - "kernel_spatial_dimensions", - dimension_numbers.kernel_spatial_dimensions()) && - check_spatial_dimensions( - "output_spatial_dimensions", - dimension_numbers.output_spatial_dimensions()); -} - -ComputationDataHandle ComputationBuilder::ConvWithGeneralDimensions( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, Padding padding, - const ConvolutionDimensionNumbers& dimension_numbers) { - if (!first_error_.ok() || !PrepareComputation().ok()) { - return ComputationDataHandle(); - } - - StatusOr> lhs_shape_or_status = GetShape(lhs); - if (!lhs_shape_or_status.ok()) { - return ComputationDataHandle(); - } - - StatusOr> rhs_shape_or_status = GetShape(rhs); - if (!rhs_shape_or_status.ok()) { - return ComputationDataHandle(); - } - - std::unique_ptr lhs_shape = lhs_shape_or_status.ConsumeValueOrDie(); - std::unique_ptr rhs_shape = rhs_shape_or_status.ConsumeValueOrDie(); - - if (!VerifyConvolution(*lhs_shape, *rhs_shape, dimension_numbers)) { - NoteError(InternalError("failed to verify convolution")); - return ComputationDataHandle(); - } - - std::vector base_area_dimensions( - dimension_numbers.input_spatial_dimensions_size()); - for (std::vector::size_type i = 0; i < base_area_dimensions.size(); - ++i) { - base_area_dimensions[i] = - lhs_shape->dimensions(dimension_numbers.input_spatial_dimensions(i)); - } - - std::vector window_dimensions( - dimension_numbers.kernel_spatial_dimensions_size()); - for (std::vector::size_type i = 0; i < window_dimensions.size(); ++i) { - window_dimensions[i] = - rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i)); - } - - return ConvGeneral(lhs, rhs, window_strides, - MakePadding(base_area_dimensions, window_dimensions, - window_strides, padding), - dimension_numbers); -} - -ComputationDataHandle ComputationBuilder::ConvGeneral( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const ConvolutionDimensionNumbers& dimension_numbers) { - return ConvGeneralDilated(lhs, rhs, window_strides, padding, {}, {}, - dimension_numbers); -} - -ComputationDataHandle ComputationBuilder::ConvGeneralDilated( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, - const ConvolutionDimensionNumbers& dimension_numbers) { - if (!first_error_.ok() || !PrepareComputation().ok()) { - return ComputationDataHandle(); - } - - 
StatusOr> lhs_shape_or_status = GetShape(lhs); - if (!lhs_shape_or_status.ok()) { - return ComputationDataHandle(); - } - - StatusOr> rhs_shape_or_status = GetShape(rhs); - if (!rhs_shape_or_status.ok()) { - return ComputationDataHandle(); - } - - std::unique_ptr lhs_shape = lhs_shape_or_status.ConsumeValueOrDie(); - std::unique_ptr rhs_shape = rhs_shape_or_status.ConsumeValueOrDie(); - if (!VerifyConvolution(*lhs_shape, *rhs_shape, dimension_numbers)) { - // Error is recorded in VerifyConvolution. - return ComputationDataHandle(); - } - - std::vector window_dimensions( - dimension_numbers.kernel_spatial_dimensions_size()); - for (std::vector::size_type i = 0; i < window_dimensions.size(); ++i) { - window_dimensions[i] = - rhs_shape->dimensions(dimension_numbers.kernel_spatial_dimensions(i)); - } - - OpRequest op_request; - ConvolveRequest* request = op_request.mutable_convolve_request(); - *request->mutable_lhs() = lhs; - *request->mutable_rhs() = rhs; - *request->mutable_dimension_numbers() = dimension_numbers; - - if (!MakeWindow(window_dimensions, window_strides, padding, lhs_dilation, - rhs_dilation, request->mutable_window())) { - // Error is recorded in MakeWindow. - return ComputationDataHandle(); - } - - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Fft( - const ComputationDataHandle& operand, const FftType fft_type, - const tensorflow::gtl::ArraySlice fft_length) { - OpRequest op_request; - FftRequest* request = op_request.mutable_fft_request(); - *request->mutable_operand() = operand; - request->set_fft_type(fft_type); - for (int64 dim_len : fft_length) { - request->add_fft_length(dim_len); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Infeed(const Shape& shape, - const string& config) { - OpRequest op_request; - InfeedRequest* request = op_request.mutable_infeed_request(); - *request->mutable_shape() = shape; - *request->mutable_config() = config; - return RunOpAndParseResponse(&op_request); -} - -void ComputationBuilder::Outfeed(const ComputationDataHandle& operand, - const Shape& shape_with_layout, - const string& outfeed_config) { - OpRequest op_request; - OutfeedRequest* request = op_request.mutable_outfeed_request(); - request->set_outfeed_config(outfeed_config); - *request->mutable_operand() = operand; - *request->mutable_shape() = shape_with_layout; - RunOpAndNoteError(&op_request); -} - -ComputationDataHandle ComputationBuilder::Call( - const Computation& computation, - tensorflow::gtl::ArraySlice operands) { - OpRequest op_request; - CallRequest* request = op_request.mutable_call_request(); - *request->mutable_to_apply() = computation.handle(); - for (const ComputationDataHandle& operand : operands) { - *request->add_operands() = operand; - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::CustomCall( - const string& call_target_name, - tensorflow::gtl::ArraySlice operands, - const Shape& shape) { - OpRequest op_request; - CustomCallRequest* request = op_request.mutable_custom_call_request(); - request->set_call_target_name(call_target_name); - for (const ComputationDataHandle& operand : operands) { - *request->add_operands() = operand; - } - *request->mutable_shape() = shape; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::HostCompute( - tensorflow::gtl::ArraySlice operands, - const string& channel_name, int64 cost_estimate_ns, const Shape& shape) { - OpRequest op_request; - 
HostComputeRequest* request = op_request.mutable_host_compute_request(); - for (const ComputationDataHandle& operand : operands) { - *request->add_operands() = operand; - } - *request->mutable_shape() = shape; - request->set_channel_name(channel_name); - request->set_cost_estimate_ns(cost_estimate_ns); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Complex( - const ComputationDataHandle& real, const ComputationDataHandle& imag, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_COMPLEX, real, imag, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Conj( - const ComputationDataHandle& operand) { - return Complex(Real(operand), Neg(Imag(operand))); -} - -ComputationDataHandle ComputationBuilder::Add( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_ADD, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Sub( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_SUB, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Mul( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_MUL, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Div( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_DIV, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Rem( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_REM, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Max( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_MAX, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Min( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_MIN, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::And( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_AND, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Or( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_OR, lhs, rhs, broadcast_dimensions); -} - -// TODO(b/65209188): Create a dedicated lowering for Xor -ComputationDataHandle ComputationBuilder::Xor( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return Or(And(Not(lhs), rhs, broadcast_dimensions), - And(lhs, Not(rhs), broadcast_dimensions)); -} - -ComputationDataHandle ComputationBuilder::Not( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_NOT, operand); -} - -ComputationDataHandle ComputationBuilder::ShiftLeft( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - 
return BinaryOp(BINOP_SHIFT_LEFT, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::ShiftRightArithmetic( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_SHIFT_RIGHT_ARITHMETIC, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::ShiftRightLogical( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_SHIFT_RIGHT_LOGICAL, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Abs( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_ABS, operand); -} - -ComputationDataHandle ComputationBuilder::Atan2( - const ComputationDataHandle& y, const ComputationDataHandle& x, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_ATAN2, y, x, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::Exp( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_EXP, operand); -} - -ComputationDataHandle ComputationBuilder::Floor( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_FLOOR, operand); -} - -ComputationDataHandle ComputationBuilder::Ceil( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_CEIL, operand); -} - -ComputationDataHandle ComputationBuilder::Round( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_ROUND_NEAREST_AFZ, operand); -} - -ComputationDataHandle ComputationBuilder::Log( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_LOG, operand); -} - -ComputationDataHandle ComputationBuilder::Sign( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_SIGN, operand); -} - -ComputationDataHandle ComputationBuilder::Cos( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_COS, operand); -} - -ComputationDataHandle ComputationBuilder::Sin( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_SIN, operand); -} - -ComputationDataHandle ComputationBuilder::Tanh( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_TANH, operand); -} - -ComputationDataHandle ComputationBuilder::Real( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_REAL, operand); -} - -ComputationDataHandle ComputationBuilder::Imag( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_IMAG, operand); -} - -ComputationDataHandle ComputationBuilder::IsFinite( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_IS_FINITE, operand); -} - -ComputationDataHandle ComputationBuilder::Transpose( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice permutation) { - OpRequest op_request; - TransposeRequest* request = op_request.mutable_transpose_request(); - *request->mutable_operand() = operand; - for (int64 dimension : permutation) { - request->add_dimensions(dimension); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Rev( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions) { - OpRequest op_request; - ReverseRequest* request = op_request.mutable_reverse_request(); - *request->mutable_operand() = operand; - for (int64 dimension : dimensions) { - request->add_dimensions(dimension); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Sort( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_SORT, operand); -} - 
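The element-wise methods above are thin wrappers that each enqueue a single UnaryOpRequest or BinaryOpRequest. A hedged sketch of how they compose under the pre-removal ComputationBuilder API shown in this diff; `Softplus` is an invented helper, not part of the API.

```c++
#include "tensorflow/compiler/xla/client/computation_builder.h"

// Hypothetical helper: softplus(x) = log(1 + exp(x)) becomes three enqueued
// instructions, one per wrapper call. The scalar constant broadcasts against
// x, as with the other element-wise binary ops.
xla::ComputationDataHandle Softplus(xla::ComputationBuilder* b,
                                    const xla::ComputationDataHandle& x) {
  auto one = b->ConstantR0<float>(1.0f);
  return b->Log(b->Add(one, b->Exp(x)));
}
```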
-ComputationDataHandle ComputationBuilder::SqrtF32( - const ComputationDataHandle& operand) { - return BinaryOp(BINOP_POW, operand, ConstantR0(0.5), - /*broadcast_dimensions=*/{}); -} - -ComputationDataHandle ComputationBuilder::Pow( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - return BinaryOp(BINOP_POW, lhs, rhs, broadcast_dimensions); -} - -ComputationDataHandle ComputationBuilder::ConvertElementType( - const ComputationDataHandle& operand, PrimitiveType new_element_type) { - if (!first_error_.ok() || !PrepareComputation().ok()) { - return ComputationDataHandle(); - } - - StatusOr> shape_status = GetShape(operand); - if (!shape_status.ok()) { - return ComputationDataHandle(); - } - std::unique_ptr original = shape_status.ConsumeValueOrDie(); - - OpRequest op_request; - ConvertRequest* request = op_request.mutable_convert_request(); - *request->mutable_operand() = operand; - request->set_new_element_type(new_element_type); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::BitcastConvertType( - const ComputationDataHandle& operand, PrimitiveType new_element_type) { - if (!first_error_.ok() || !PrepareComputation().ok()) { - return ComputationDataHandle(); - } - - StatusOr> shape_status = GetShape(operand); - if (!shape_status.ok()) { - return ComputationDataHandle(); - } - std::unique_ptr original = shape_status.ConsumeValueOrDie(); - - OpRequest op_request; - ConvertRequest* request = op_request.mutable_bitcast_convert_request(); - *request->mutable_operand() = operand; - request->set_new_element_type(new_element_type); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::SquareF32( - const ComputationDataHandle& operand) { - return BinaryOp(BINOP_POW, operand, ConstantR0(2.0), - /*broadcast_dimensions=*/{}); -} - -ComputationDataHandle ComputationBuilder::ReciprocalF32( - const ComputationDataHandle& operand) { - return BinaryOp(BINOP_POW, operand, ConstantR0(-1.0), - /*broadcast_dimensions=*/{}); -} - -ComputationDataHandle ComputationBuilder::Neg( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_NEGATE, operand); -} - -ComputationDataHandle ComputationBuilder::Clz( - const ComputationDataHandle& operand) { - return UnaryOp(UNOP_CLZ, operand); -} - -ComputationDataHandle ComputationBuilder::Clamp( - const ComputationDataHandle& min, const ComputationDataHandle& operand, - const ComputationDataHandle& max) { - return TernaryOp(TRIOP_CLAMP, min, operand, max); -} - -ComputationDataHandle ComputationBuilder::UnaryOp( - UnaryOperation unop, const ComputationDataHandle& operand) { - OpRequest op_request; - UnaryOpRequest* request = op_request.mutable_unary_op_request(); - request->set_unop(unop); - *request->mutable_operand() = operand; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::BinaryOp( - BinaryOperation binop, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions) { - OpRequest op_request; - BinaryOpRequest* request = op_request.mutable_binary_op_request(); - request->set_binop(binop); - *request->mutable_lhs() = lhs; - *request->mutable_rhs() = rhs; - for (int64 dimension : broadcast_dimensions) { - request->add_broadcast_dimensions(dimension); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::RngOp( - RandomDistribution distribution, - 
tensorflow::gtl::ArraySlice parameters, - const Shape& shape) { - OpRequest op_request; - RngRequest* request = op_request.mutable_rng_request(); - request->set_distribution(distribution); - for (const ComputationDataHandle& param : parameters) { - *request->add_parameter() = param; - } - *request->mutable_shape() = shape; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::TernaryOp( - TernaryOperation triop, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs, const ComputationDataHandle& ehs) { - OpRequest op_request; - TernaryOpRequest* request = op_request.mutable_ternary_op_request(); - request->set_triop(triop); - *request->mutable_lhs() = lhs; - *request->mutable_rhs() = rhs; - *request->mutable_ehs() = ehs; - return RunOpAndParseResponse(&op_request); -} - -Status ComputationBuilder::SetReturnValue( - const ComputationDataHandle& operand) { - TF_RETURN_IF_ERROR(first_error_); - - SetReturnValueRequest request; - *request.mutable_computation() = computation_.handle(); - *request.mutable_operand() = operand; - - SetReturnValueResponse response; - - VLOG(2) << "making set-handle-to-execute request"; - Status s = client_->stub()->SetReturnValue(&request, &response); - VLOG(2) << "done with request"; - - if (!s.ok()) { - NoteError(s); - return first_error_; - } - - return Status::OK(); -} - -StatusOr ComputationBuilder::IsConstant( - const ComputationDataHandle& operand, int64 num_parameters) { - TF_RETURN_IF_ERROR(first_error_); - - IsConstantRequest request; - *request.mutable_computation() = computation_.handle(); - *request.mutable_operand() = operand; - request.set_num_parameters(num_parameters); - IsConstantResponse response; - - VLOG(2) << "making IsConstant request"; - Status s = client_->stub()->IsConstant(&request, &response); - VLOG(2) << "done with request"; - - if (!s.ok()) { - return s; - } - return response.is_constant(); -} - -StatusOr> ComputationBuilder::ComputeConstant( - const ComputationDataHandle& operand, const Layout* output_layout, - tensorflow::gtl::ArraySlice parameters) { - TF_RETURN_IF_ERROR(first_error_); - - ComputeConstantRequest request; - *request.mutable_computation() = computation_.handle(); - *request.mutable_operand() = operand; - if (output_layout != nullptr) { - *request.mutable_output_layout() = *output_layout; - } - for (const auto& param : parameters) { - *request.add_parameters() = param.ToProto(); - } - - ComputeConstantResponse response; - - VLOG(2) << "making compute-constant request"; - Status s = client_->stub()->ComputeConstant(&request, &response); - VLOG(2) << "done with request"; - - if (!s.ok()) { - return s; - } - - VLOG(3) << "ComputeConstant: {" << response.DebugString() << "}"; - - if (!response.has_literal()) { - return InternalError( - "no computed literal in the provided response in ComputeConstant " - "request"); - } - return Literal::CreateFromProto(response.literal()); -} - -ComputationDataHandle ComputationBuilder::Map( - tensorflow::gtl::ArraySlice operands, - const Computation& computation, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice static_operands) { - OpRequest op_request; - MapRequest* request = op_request.mutable_map_request(); - for (const ComputationDataHandle& operand : operands) { - *request->add_operands() = operand; - } - *request->mutable_to_apply() = computation.handle(); - for (int64 dimension : dimensions) { - request->add_dimensions(dimension); - } - for (const ComputationDataHandle& sop : static_operands) { - 
*request->add_static_operands() = sop; - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::RngNormal( - const ComputationDataHandle& mu, const ComputationDataHandle& sigma, - const Shape& shape) { - return RngOp(RandomDistribution::RNG_NORMAL, {mu, sigma}, shape); -} - -ComputationDataHandle ComputationBuilder::RngUniform( - const ComputationDataHandle& a, const ComputationDataHandle& b, - const Shape& shape) { - return RngOp(RandomDistribution::RNG_UNIFORM, {a, b}, shape); -} - -ComputationDataHandle ComputationBuilder::While( - const Computation& condition, const Computation& body, - const ComputationDataHandle& init) { - OpRequest op_request; - WhileRequest* request = op_request.mutable_while_request(); - *request->mutable_condition() = condition.handle(); - *request->mutable_body() = body.handle(); - *request->mutable_init() = init; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Gather( - const ComputationDataHandle& input, - const ComputationDataHandle& gather_indices, - const GatherDimensionNumbers& dimension_numbers, - tensorflow::gtl::ArraySlice window_bounds) { - OpRequest op_request; - GatherRequest* gather_request = op_request.mutable_gather_request(); - *gather_request->mutable_input() = input; - *gather_request->mutable_gather_indices() = gather_indices; - *gather_request->mutable_dimension_numbers() = dimension_numbers; - for (int64 window_bound : window_bounds) { - gather_request->add_window_bounds(window_bound); - } - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Conditional( - const ComputationDataHandle& predicate, - const ComputationDataHandle& true_operand, - const Computation& true_computation, - const ComputationDataHandle& false_operand, - const Computation& false_computation) { - OpRequest op_request; - ConditionalRequest* request = op_request.mutable_conditional_request(); - *request->mutable_predicate() = predicate; - *request->mutable_true_operand() = true_operand; - *request->mutable_true_computation() = true_computation.handle(); - *request->mutable_false_operand() = false_operand; - *request->mutable_false_computation() = false_computation.handle(); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::Reduce( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, const Computation& computation, - tensorflow::gtl::ArraySlice dimensions_to_reduce) { - OpRequest op_request; - ReduceRequest* request = op_request.mutable_reduce_request(); - *request->mutable_operand() = operand; - *request->mutable_init_value() = init_value; - for (int64 dimension : dimensions_to_reduce) { - request->add_dimensions(dimension); - } - *request->mutable_to_apply() = computation.handle(); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::ReduceAll( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, const Computation& computation) { - if (!first_error_.ok() || !PrepareComputation().ok()) { - return ComputationDataHandle(); - } - - StatusOr> shape = GetShape(operand); - if (!shape.ok()) { - return ComputationDataHandle(); - } - - std::vector all_dimnos(ShapeUtil::Rank(*shape.ValueOrDie())); - std::iota(all_dimnos.begin(), all_dimnos.end(), 0); - return Reduce(operand, init_value, computation, all_dimnos); -} - -ComputationDataHandle ComputationBuilder::ReduceWindow( - const ComputationDataHandle& operand, - const 
ComputationDataHandle& init_value, const Computation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding) { - if (!first_error_.ok()) { - return ComputationDataHandle(); - } - - StatusOr> shape = GetShape(operand); - if (!shape.ok()) { - return ComputationDataHandle(); - } - - Status padding_valid = - ValidatePaddingValues(AsInt64Slice(shape.ValueOrDie()->dimensions()), - window_dimensions, window_strides); - if (!padding_valid.ok()) { - first_error_ = padding_valid; - return ComputationDataHandle(); - } - - std::vector> padding_values = - MakePadding(AsInt64Slice(shape.ValueOrDie()->dimensions()), - window_dimensions, window_strides, padding); - return ReduceWindowWithGeneralPadding(operand, init_value, computation, - window_dimensions, window_strides, - padding_values); -} - -ComputationDataHandle ComputationBuilder::ReduceWindowWithGeneralPadding( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, const Computation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding) { - OpRequest op_request; - ReduceWindowRequest* request = op_request.mutable_reduce_window_request(); - *request->mutable_operand() = operand; - *request->mutable_to_apply() = computation.handle(); - *request->mutable_init_value() = init_value; - - if (!MakeWindow(window_dimensions, window_strides, padding, {}, {}, - request->mutable_window())) { - NoteError(InternalError("failed to make window")); - return ComputationDataHandle(); - } - - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::BatchNormTraining( - const ComputationDataHandle& operand, const ComputationDataHandle& scale, - const ComputationDataHandle& offset, float epsilon, int64 feature_index) { - OpRequest op_request; - BatchNormTrainingRequest* request = - op_request.mutable_batch_norm_training_request(); - *request->mutable_operand() = operand; - *request->mutable_scale() = scale; - *request->mutable_offset() = offset; - request->set_epsilon(epsilon); - request->set_feature_index(feature_index); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::BatchNormInference( - const ComputationDataHandle& operand, const ComputationDataHandle& scale, - const ComputationDataHandle& offset, const ComputationDataHandle& mean, - const ComputationDataHandle& variance, float epsilon, int64 feature_index) { - OpRequest op_request; - BatchNormInferenceRequest* request = - op_request.mutable_batch_norm_inference_request(); - *request->mutable_operand() = operand; - *request->mutable_scale() = scale; - *request->mutable_offset() = offset; - *request->mutable_mean() = mean; - *request->mutable_variance() = variance; - request->set_epsilon(epsilon); - request->set_feature_index(feature_index); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::BatchNormGrad( - const ComputationDataHandle& operand, const ComputationDataHandle& scale, - const ComputationDataHandle& batch_mean, - const ComputationDataHandle& batch_var, - const ComputationDataHandle& grad_output, float epsilon, - int64 feature_index) { - OpRequest op_request; - BatchNormGradRequest* request = op_request.mutable_batch_norm_grad_request(); - *request->mutable_operand() = operand; - *request->mutable_scale() = scale; - *request->mutable_mean() = batch_mean; - *request->mutable_variance() = batch_var; 
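As a usage-level illustration of the ReduceWindow path above: the reduction operator is itself a Computation built with a child builder. The sketch below assumes the pre-removal API shown in this diff; `MaxPool2x2` and all shape values are invented.

```c++
#include <limits>

#include "tensorflow/compiler/xla/client/computation_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Hypothetical 2x2 max-pooling over the trailing spatial dimensions of an
// NCHW operand, expressed with ReduceWindow.
xla::ComputationDataHandle MaxPool2x2(xla::Client* client,
                                      xla::ComputationBuilder* b,
                                      const xla::ComputationDataHandle& input) {
  // Build the binary reduction computation: (x, y) -> max(x, y).
  xla::ComputationBuilder max_builder(client, "max_f32");
  auto scalar = xla::ShapeUtil::MakeShape(xla::F32, {});
  auto x = max_builder.Parameter(0, scalar, "x");
  auto y = max_builder.Parameter(1, scalar, "y");
  max_builder.Max(x, y);  // last enqueued op becomes the root
  xla::Computation max_computation = max_builder.Build().ConsumeValueOrDie();

  // Pool with a 2x2 window and stride 2 over the two spatial dimensions.
  auto init = b->ConstantR0<float>(-std::numeric_limits<float>::infinity());
  return b->ReduceWindow(input, init, max_computation,
                         /*window_dimensions=*/{1, 1, 2, 2},
                         /*window_strides=*/{1, 1, 2, 2},
                         xla::Padding::kValid);
}
```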
- *request->mutable_grad_output() = grad_output; - request->set_epsilon(epsilon); - request->set_feature_index(feature_index); - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::CrossReplicaSum( - const ComputationDataHandle& operand) { - OpRequest op_request; - CrossReplicaSumRequest* request = - op_request.mutable_cross_replica_sum_request(); - *request->mutable_operand() = operand; - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::SelectAndScatter( - const ComputationDataHandle& operand, const Computation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding, - const ComputationDataHandle& source, - const ComputationDataHandle& init_value, const Computation& scatter) { - if (!first_error_.ok()) { - return ComputationDataHandle(); - } - - StatusOr> shape = GetShape(operand); - if (!shape.ok()) { - return ComputationDataHandle(); - } - return SelectAndScatterWithGeneralPadding( - operand, select, window_dimensions, window_strides, - MakePadding(AsInt64Slice(shape.ValueOrDie()->dimensions()), - window_dimensions, window_strides, padding), - source, init_value, scatter); -} - -ComputationDataHandle ComputationBuilder::SelectAndScatterWithGeneralPadding( - const ComputationDataHandle& operand, const Computation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const ComputationDataHandle& source, - const ComputationDataHandle& init_value, const Computation& scatter) { - OpRequest op_request; - SelectAndScatterRequest* request = - op_request.mutable_select_and_scatter_request(); - *request->mutable_operand() = operand; - *request->mutable_select() = select.handle(); - *request->mutable_source() = source; - *request->mutable_init_value() = init_value; - *request->mutable_scatter() = scatter.handle(); - - if (!MakeWindow(window_dimensions, window_strides, padding, {}, {}, - request->mutable_window())) { - NoteError(InternalError("failed to make window")); - return ComputationDataHandle(); - } - - return RunOpAndParseResponse(&op_request); -} - -ComputationDataHandle ComputationBuilder::ReducePrecision( - const ComputationDataHandle& operand, const int exponent_bits, - const int mantissa_bits) { - OpRequest op_request; - ReducePrecisionRequest* request = - op_request.mutable_reduce_precision_request(); - *request->mutable_operand() = operand; - request->set_exponent_bits(exponent_bits); - request->set_mantissa_bits(mantissa_bits); - return RunOpAndParseResponse(&op_request); -} - -void ComputationBuilder::Send(const ComputationDataHandle& operand, - const ChannelHandle& handle) { - OpRequest op_request; - SendRequest* request = op_request.mutable_send_request(); - *request->mutable_operand() = operand; - *request->mutable_channel_handle() = handle; - *op_request.mutable_computation() = computation_.handle(); - RunOpAndNoteError(&op_request); -} - -ComputationDataHandle ComputationBuilder::Recv(const Shape& shape, - const ChannelHandle& handle) { - OpRequest op_request; - RecvRequest* request = op_request.mutable_recv_request(); - *request->mutable_shape() = shape; - *request->mutable_channel_handle() = handle; - return RunOpAndParseResponse(&op_request); -} - -Computation ComputationBuilder::BuildAndNoteError() { - DCHECK(parent_builder_ != nullptr); - auto build_status = Build(); - if (!build_status.ok()) { - parent_builder_->NoteError( - 
AddStatus(build_status.status(), - tensorflow::strings::StrCat("error from: ", name_))); - return Computation(); - } - return build_status.ConsumeValueOrDie(); -} - -StatusOr ComputationBuilder::Build() { - if (!first_error_.ok()) { - string backtrace; - first_error_backtrace_.Dump(tensorflow::DebugWriteToString, &backtrace); - return AppendStatus(first_error_, backtrace); - } - - if (computation_.IsNull()) { - return FailedPrecondition("no computation was built"); - } - - return {std::move(computation_)}; -} - -/* static */ ConvolutionDimensionNumbers -ComputationBuilder::CreateDefaultConvDimensionNumbers(int num_spatial_dims) { - ConvolutionDimensionNumbers dimension_numbers; - dimension_numbers.set_input_batch_dimension(kConvBatchDimension); - dimension_numbers.set_input_feature_dimension(kConvFeatureDimension); - dimension_numbers.set_output_batch_dimension(kConvBatchDimension); - dimension_numbers.set_output_feature_dimension(kConvFeatureDimension); - dimension_numbers.set_kernel_output_feature_dimension( - kConvKernelOutputDimension); - dimension_numbers.set_kernel_input_feature_dimension( - kConvKernelInputDimension); - for (int i = 0; i < num_spatial_dims; ++i) { - dimension_numbers.add_input_spatial_dimensions(i + 2); - dimension_numbers.add_kernel_spatial_dimensions(i + 2); - dimension_numbers.add_output_spatial_dimensions(i + 2); - } - return dimension_numbers; -} - -/* static */ StatusOr -ComputationBuilder::CreateConvDimensionNumbers( - int64 input_batch, int64 input_feature, int64 input_first_spatial, - int64 input_second_spatial, int64 output_batch, int64 output_feature, - int64 output_first_spatial, int64 output_second_spatial, - int64 kernel_output_feature, int64 kernel_input_feature, - int64 kernel_first_spatial, int64 kernel_second_spatial) { - if (std::set({input_batch, input_feature, input_first_spatial, - input_second_spatial}) - .size() != 4) { - return FailedPrecondition( - "dimension numbers for the input are not unique: (%lld, %lld, %lld, " - "%lld)", - input_batch, input_feature, input_first_spatial, input_second_spatial); - } - if (std::set({kernel_output_feature, kernel_input_feature, - kernel_first_spatial, kernel_second_spatial}) - .size() != 4) { - return FailedPrecondition( - "dimension numbers for the weight are not unique: (%lld, %lld, %lld, " - "%lld)", - kernel_output_feature, kernel_input_feature, kernel_first_spatial, - kernel_second_spatial); - } - if (std::set({output_batch, output_feature, output_first_spatial, - output_second_spatial}) - .size() != 4) { - return FailedPrecondition( - "dimension numbers for the output are not unique: (%lld, %lld, %lld, " - "%lld)", - output_batch, output_feature, output_first_spatial, - output_second_spatial); - } - ConvolutionDimensionNumbers dimension_numbers; - dimension_numbers.set_input_batch_dimension(input_batch); - dimension_numbers.set_input_feature_dimension(input_feature); - dimension_numbers.add_input_spatial_dimensions(input_first_spatial); - dimension_numbers.add_input_spatial_dimensions(input_second_spatial); - dimension_numbers.set_kernel_output_feature_dimension(kernel_output_feature); - dimension_numbers.set_kernel_input_feature_dimension(kernel_input_feature); - dimension_numbers.add_kernel_spatial_dimensions(kernel_first_spatial); - dimension_numbers.add_kernel_spatial_dimensions(kernel_second_spatial); - dimension_numbers.set_output_batch_dimension(output_batch); - dimension_numbers.set_output_feature_dimension(output_feature); - 
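A usage sketch of the convolution entry points tied to the default dimension numbers constructed above, assuming the pre-removal API in this diff; the function name and shapes are invented.

```c++
#include "tensorflow/compiler/xla/client/computation_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Hypothetical 2D convolution relying on the defaults: activations laid out
// as {batch, feature, y, x} and the kernel as
// {output_feature, input_feature, y, x}.
xla::ComputationDataHandle DefaultLayoutConv(xla::ComputationBuilder* b) {
  auto input = b->Parameter(
      0, xla::ShapeUtil::MakeShape(xla::F32, {8, 3, 32, 32}), "input");
  auto kernel = b->Parameter(
      1, xla::ShapeUtil::MakeShape(xla::F32, {16, 3, 5, 5}), "kernel");
  // Conv() is equivalent to ConvWithGeneralDimensions() with
  // CreateDefaultConvDimensionNumbers(/*num_spatial_dims=*/2).
  return b->Conv(input, kernel, /*window_strides=*/{1, 1},
                 xla::Padding::kSame);
}
```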
dimension_numbers.add_output_spatial_dimensions(output_first_spatial); - dimension_numbers.add_output_spatial_dimensions(output_second_spatial); - return dimension_numbers; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/client/computation_builder.h b/tensorflow/compiler/xla/client/computation_builder.h deleted file mode 100644 index 9431c2c459a564..00000000000000 --- a/tensorflow/compiler/xla/client/computation_builder.h +++ /dev/null @@ -1,1065 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_BUILDER_H_ -#define TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_BUILDER_H_ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/array.h" -#include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/array3d.h" -#include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/global_data.h" -#include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/bitmap.h" -#include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/array_slice.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/stacktrace.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { - -// Wraps an XLA client with a convenient interface for building up -// computations. Any errors encountered in building up the computation are -// deferred from being handled until Build() is called. -// -// Thread-compatible. -class ComputationBuilder { - public: - // client: client in which to build the computation. - // computation_name: name to use for the built computation. - ComputationBuilder(Client* client, const string& computation_name); - - ~ComputationBuilder(); - - // Returns the client the builder was initialized with. - Client* client() const { return client_; } - - // Returns the computation name. - const string& name() const { return name_; } - - // Sets OpMetadata that will be added to all instructions until cleared. - // - // OpMetadata is often applied to a series of XLA HLO instructions. As a - // result, OpMetadata is set on the Computation Builder. All subsequent - // instructions generated via this Computation Builder will have the same - // OpMetadata attached until a call to ClearOpMetadata. - void SetOpMetadata(const OpMetadata& metadata) { metadata_ = metadata; } - - // Clears the HloMetadata state. - void ClearOpMetadata() { metadata_.Clear(); } - - // Sets an OpSharding that will be attached to all instructions until cleared. 
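To make the deferred-error model described above concrete, here is a minimal usage sketch assuming a connected xla::Client and the pre-removal ComputationBuilder API shown in this diff; `BuildAxpy` and the shape are invented.

```c++
#include "tensorflow/compiler/xla/client/client.h"
#include "tensorflow/compiler/xla/client/computation_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Each Op call returns a handle immediately; any error recorded along the
// way is surfaced only when Build() is called.
xla::StatusOr<xla::Computation> BuildAxpy(xla::Client* client) {
  xla::ComputationBuilder builder(client, "axpy");
  auto shape = xla::ShapeUtil::MakeShape(xla::F32, {1024});
  auto x = builder.Parameter(0, shape, "x");
  auto y = builder.Parameter(1, shape, "y");
  auto alpha = builder.ConstantR0<float>(2.0f);
  builder.Add(builder.Mul(alpha, x), y);  // root = last enqueued instruction
  // If any call above noted an error, Build() returns it here.
  return builder.Build();
}
```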
- void SetSharding(const OpSharding& sharding) { sharding_ = sharding; } - - // Clears the sharding. Ops will be sharded according to the default placement - // policy. - void ClearSharding() { sharding_ = tensorflow::gtl::nullopt; } - - // Returns the OpSharding that will be attached to all instructions. - const tensorflow::gtl::optional& sharding() const { - return sharding_; - } - - // Sets the builder to a mode where it will die immediately when an error is - // encountered, rather than producing it in a deferred fashion when Build() is - // called (which is the default). - void set_die_immediately_on_error(bool enabled) { - die_immediately_on_error_ = enabled; - } - - // Enqueues a "retrieve parameter value" instruction for a parameter that was - // passed to the computation. - ComputationDataHandle Parameter(int64 parameter_number, const Shape& shape, - const string& name); - - // Retrieves the (inferred) shape of the operand in the computation. - StatusOr> GetShape( - const ComputationDataHandle& operand); - - // Retrieves the (inferred) result for the current computation's shape. - StatusOr GetProgramShape(); - - // Enqueues a constant with the value of the given literal onto the - // computation. - ComputationDataHandle ConstantLiteral(const Literal& literal); - - // Enqueues a constant onto the computation. Methods are templated on the - // native host type (NativeT) which corresponds to a specific XLA - // PrimitiveType as given in the following table: - // - // Native Type PrimitiveType - // ----------------------------- - // bool PRED - // int32 S32 - // int64 S64 - // uint32 U32 - // uint64 U64 - // float F32 - // double F64 - // - // Note: not all primitive types defined in xla_data.proto have a - // corresponding native type yet. - template - ComputationDataHandle ConstantR0(NativeT value); - template - ComputationDataHandle ConstantR1(tensorflow::gtl::ArraySlice values); - ComputationDataHandle ConstantR1(const tensorflow::core::Bitmap& values); - template - ComputationDataHandle ConstantR2( - std::initializer_list> values); - template - ComputationDataHandle ConstantFromArrayWithLayout( - const Array& values, const Layout& layout); - template - ComputationDataHandle ConstantFromArray(const Array& values); - template - ComputationDataHandle ConstantR2FromArray2DWithLayout( - const Array2D& values, const Layout& layout); - template - ComputationDataHandle ConstantR2FromArray2D(const Array2D& values); - template - ComputationDataHandle ConstantR3FromArray3DWithLayout( - const Array3D& values, const Layout& layout); - template - ComputationDataHandle ConstantR3FromArray3D(const Array3D& values); - template - ComputationDataHandle ConstantR4FromArray4DWithLayout( - const Array4D& values, const Layout& layout); - template - ComputationDataHandle ConstantR4FromArray4D(const Array4D& values); - - // Enqueues a rank one constant (vector) onto the computation. The vector has - // size 'length' and every element has the value 'value'. - template - ComputationDataHandle ConstantR1(int64 length, NativeT value); - - // Adds dimensions to an array by duplicating the data in the array. - // - // The new dimensions are inserted on the left, i.e. if - // broadcast_sizes has values {a0, ..., aN} and the operand shape - // has dimensions {b0, ..., bM} then the shape of the output has - // dimensions {a0, ..., aN, b0, ..., bM}. - // - // The new dimensions index into copies of the operand, i.e. 
- // - // output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM] - ComputationDataHandle Broadcast( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice broadcast_sizes); - - // Enqueues a pad operation onto the computation that pads the given value on - // the edges as well as between the elements of the input. padding_config - // specifies the padding amount for each dimension. - ComputationDataHandle Pad(const ComputationDataHandle& operand, - const ComputationDataHandle& padding_value, - const PaddingConfig& padding_config); - - // Enqueues an operation onto the computation that flattens the operand based - // on the dimension order (major/slowest-varying to minor/fastest-varying) - // given, followed by reshaping it into the shape with the given dimension - // sizes (also major to minor). Conceptually, this is a limited form of - // "shape casting". - ComputationDataHandle Reshape(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes); - - // Enqueues an operation onto the computation that collapses the operand, from - // first to last dimension (C order), then reshapes it to the given dimension - // sizes. Conceptually, this is a limited form of "shape casting". - ComputationDataHandle Reshape(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice new_sizes); - - // Wrapper for Reshape. - // Enqueues an operation to collapse the provided dimensions; e.g. an - // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to - // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must - // be a consecutive, in-order subsequence of the operand dimensions. - // - // Note that collapsing a single dimension does nothing: - // - // {256} collapsing {0} => {256} - // {1} collapsing {0} => {1} - // - // Collapsing multiple dimensions produces a single result dimension: - // - // {256, 2} collapsing {0,1} => {512} - // {256, 2, 3} collapsing {0,1} => {512, 3} - // - // This could potentially cause data to be moved -- it provides a more - // structured form of reshaping than an arbitrary Reshape operation. - ComputationDataHandle Collapse(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions); - - // Enqueues a slice operation onto the computation that slices the operand - // from the start indices to the limit indices; e.g. - // - // x - // [ 0 1 2 3 ] - // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ] - // [ 8 9 a b ] - // - // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D - // range notation. - // The strides parameter determines the stride over the slice - ComputationDataHandle Slice(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides); - - // Enqueues a slice operation in a given dimension, taking all other - // dimensions as they are; e.g. if dimno is 1 from start_index 2 to - // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand - // for: - // - // array[:, 2:4:1, :] - ComputationDataHandle SliceInDim(const ComputationDataHandle& operand, - int64 start_index, int64 limit_index, - int64 stride, int64 dimno); - - // Enqueues a slice operation onto the computation that slices the 'operand' - // from dynamic start indices which are passed in 'start_indices'. 
- // The size of the slice in each dimension is passed in 'slice_sizes', - // which specify the end point of exclusive slice intervals in each - // dimension [start, start + size). - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. - // Slice index calculations are computed modulo input dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. - ComputationDataHandle DynamicSlice( - const ComputationDataHandle& operand, - const ComputationDataHandle& start_indices, - tensorflow::gtl::ArraySlice slice_sizes); - - // Enqueues a dynamic update slice operation onto the computation, which - // updates a slice of 'operand' with 'update' at dynamic 'start_indices'. - // The shape of 'update' determines the shape of the slice of 'operand' - // which is updated. - // The indices specified in 'start_indices' specify the offset of the slice - // of 'operand' which is updated. - // - // update = {10, 11} // calculated at runtime. - // [1 2 3] start = {1, 1} // calculated at runtime. [1 2 3 ] - // [4 5 6] => DynamicUpdateslice(data, update, start) => [4 10 11] - // [7 8 9] [7 8 9 ] - // - // The shape of 'start_indices' must be rank == 1, with dimension size - // equal to the rank of the 'operand'. - // Slice index calculations are computed modulo update dimension sizes to - // prevent dynamic start indices from generating out-of-bound array accesses. - ComputationDataHandle DynamicUpdateSlice( - const ComputationDataHandle& operand, const ComputationDataHandle& update, - const ComputationDataHandle& start_indices); - - // Enqueues a concatenate instruction onto the computation. 'operands' must - // have >= 1 entry. - ComputationDataHandle ConcatInDim( - tensorflow::gtl::ArraySlice operands, - int64 dimension); - - // Enqueue a tracing operation onto the computation; the computation will emit - // a logging message with the operand. - void Trace(const string& tag, const ComputationDataHandle& operand); - - // Enqueues a conditional-move-like select operation onto the computation; - // predicated on pred, selects between on_true and on_false. - ComputationDataHandle Select(const ComputationDataHandle& pred, - const ComputationDataHandle& on_true, - const ComputationDataHandle& on_false); - - // Enqueues a tuple-creation instruction onto the computation. - ComputationDataHandle Tuple( - tensorflow::gtl::ArraySlice elements); - - // Enqueues a tuple-element-get instruction onto the computation. - ComputationDataHandle GetTupleElement(const ComputationDataHandle& tuple_data, - int64 index); - - // Enqueues an equal-to comparison instruction onto the computation. - ComputationDataHandle Eq( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a not-equal comparison instruction onto the computation. - ComputationDataHandle Ne( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a greater-or-equal comparison instruction onto the computation. - ComputationDataHandle Ge( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a greater-than comparison instruction onto the computation. 
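A short sketch tying together the slicing variants documented above, assuming the pre-removal API in this diff; the function name, shapes, and index values are invented (the {7, 8, 9} example follows the SliceInDim docs).

```c++
#include "tensorflow/compiler/xla/client/computation_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Hypothetical usage of Slice, SliceInDim, and DynamicSlice.
void SliceExamples(xla::ComputationBuilder* b) {
  auto data = b->Parameter(
      0, xla::ShapeUtil::MakeShape(xla::F32, {7, 8, 9}), "data");

  // Static slice in one dimension: equivalent to array[:, 2:4:1, :].
  auto s1 = b->SliceInDim(data, /*start_index=*/2, /*limit_index=*/4,
                          /*stride=*/1, /*dimno=*/1);
  // The same region expressed with the general Slice() call.
  auto s2 = b->Slice(data, /*start_indices=*/{0, 2, 0},
                     /*limit_indices=*/{7, 4, 9}, /*strides=*/{1, 1, 1});

  // Dynamic slice: the start indices are a rank-1 operand, so they can be
  // computed at runtime; slice_sizes fixes the shape of the result.
  auto starts = b->ConstantR1<xla::int32>({0, 2, 0});
  auto s3 = b->DynamicSlice(data, starts, /*slice_sizes=*/{7, 2, 9});
  (void)s1; (void)s2; (void)s3;
}
```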
- ComputationDataHandle Gt( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a less-than comparison instruction onto the computation. - ComputationDataHandle Lt( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a less-or-equal comparison instruction onto the computation. - ComputationDataHandle Le( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a dot instruction onto the computation. - ComputationDataHandle Dot(const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs); - - // Enqueues a general dot instruction onto the computation. - ComputationDataHandle DotGeneral( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - const DotDimensionNumbers& dimension_numbers); - - // Default dimension numbers used for a 2D convolution. - static constexpr int64 kConvBatchDimension = 0; - static constexpr int64 kConvFeatureDimension = 1; - static constexpr int64 kConvFirstSpatialDimension = 2; - static constexpr int64 kConvSecondSpatialDimension = 3; - static constexpr int64 kConvKernelOutputDimension = 0; - static constexpr int64 kConvKernelInputDimension = 1; - static constexpr int64 kConvKernelFirstSpatialDimension = 2; - static constexpr int64 kConvKernelSecondSpatialDimension = 3; - - // Creates a default ConvolutionDimensionNumbers. For a 2D convolution, for - // the input operand {batch, feature, height, width} = {0, 1, 2, 3} and for - // the kernel operand - // {output_feature, input_feature, height, width} = {0, 1, 2, 3}. - static ConvolutionDimensionNumbers CreateDefaultConvDimensionNumbers( - int num_spatial_dims = 2); - - // Creates a ConvolutionDimensionNumbers with the given arguments. Returns an - // error if either the input or the weight dimension numbers have conflicts. - static StatusOr CreateConvDimensionNumbers( - int64 input_batch, int64 input_feature, int64 input_first_spatial, - int64 input_second_spatial, int64 output_batch, int64 output_feature, - int64 output_first_spatial, int64 output_second_spatial, - int64 kernel_output_feature, int64 kernel_input_feature, - int64 kernel_first_spatial, int64 kernel_second_spatial); - - // Enqueues a convolution instruction onto the computation, which uses the - // default convolution dimension numbers. - ComputationDataHandle Conv(const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, - Padding padding); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration in the format returned by MakePadding(). - ComputationDataHandle ConvWithGeneralPadding( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided dimension numbers configuration. - ComputationDataHandle ConvWithGeneralDimensions( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, Padding padding, - const ConvolutionDimensionNumbers& dimension_numbers); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration as well as the dimension numbers. 
- ComputationDataHandle ConvGeneral( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const ConvolutionDimensionNumbers& dimension_numbers); - - // Enqueues a convolution instruction onto the computation, with the caller - // provided padding configuration, dilation factors and dimension numbers. - ComputationDataHandle ConvGeneralDilated( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, - const ConvolutionDimensionNumbers& dimension_numbers); - - // Enqueues an FFT instruction onto the computation, of the given type and - // with the given FFT length. - ComputationDataHandle Fft(const ComputationDataHandle& operand, - FftType fft_type, - tensorflow::gtl::ArraySlice fft_length); - - // Enqueues an infeed instruction onto the computation, which writes data of - // the given shape to the infeed buffer of the device. - ComputationDataHandle Infeed(const Shape& shape, const string& config = ""); - - // Enqueues an outfeed instruction onto the computation. This instruction - // generates outgoing data transfers for the given data. - // - // shape_with_layout communicates the laid out shape that we want to outfeed - // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error - // will occur. - void Outfeed(const ComputationDataHandle& operand, - const Shape& shape_with_layout, const string& outfeed_config); - - // Enqueues a call instruction onto the computation. - ComputationDataHandle Call( - const Computation& computation, - tensorflow::gtl::ArraySlice operands); - - // Enqueues a custom call instruction onto the computation. - // During code generation, a call instruction is emitted which targets a - // symbol with the name |call_target_name|. The |operands| are passed to the - // call instruction. |shape| is the resultant shape. - ComputationDataHandle CustomCall( - const string& call_target_name, - tensorflow::gtl::ArraySlice operands, - const Shape& shape); - - // Enqueues a pseudo-op to represent host-side computation data-dependencies. - // During code generation, host send and receive operations will be generated - // to transfer |operands| to the host and a single result of |shape| back to - // the device. Host send/recv operations are emitted using |channel_name|. - // Dataflow dependencies and the |cost_estimate_ns| field may be used in HLO - // instruction scheduling. - ComputationDataHandle HostCompute( - tensorflow::gtl::ArraySlice operands, - const string& channel_name, int64 cost_estimate_ns, const Shape& shape); - - // The following methods enqueue element-wise binary arithmetic operations - // onto the computation. The shapes of the operands have to match unless one - // of the operands is a scalar, or an explicit broadcast dimension is given - // (see g3doc for more details). - - // Enqueues a complex compose instruction onto the computation. - ComputationDataHandle Complex( - const ComputationDataHandle& real, const ComputationDataHandle& imag, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a complex conjugate instruction onto the computation. - ComputationDataHandle Conj(const ComputationDataHandle& operand); - - // Enqueues an add instruction onto the computation. 
- ComputationDataHandle Add( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a subtract instruction onto the computation. - ComputationDataHandle Sub( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a multiply instruction onto the computation. - ComputationDataHandle Mul( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a divide instruction onto the computation. - ComputationDataHandle Div( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a remainder instruction onto the computation. - ComputationDataHandle Rem( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a max instruction onto the computation. - ComputationDataHandle Max( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues a min instruction onto the computation. - ComputationDataHandle Min( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Element-wise logical operators - ComputationDataHandle And( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - ComputationDataHandle Or( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - ComputationDataHandle Xor( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - ComputationDataHandle Not(const ComputationDataHandle& operand); - - ComputationDataHandle ShiftLeft( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - ComputationDataHandle ShiftRightArithmetic( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - ComputationDataHandle ShiftRightLogical( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Reduces an array among the provided dimensions, given "computation" as a - // reduction operator. - ComputationDataHandle Reduce( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, const Computation& computation, - tensorflow::gtl::ArraySlice dimensions_to_reduce); - - // Convenience wrapper around the above that reduces all the dimensions in the - // operand shape. - ComputationDataHandle ReduceAll(const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, - const Computation& computation); - - // Enqueues a windowed reduce instruction onto the computation. - ComputationDataHandle ReduceWindow( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, const Computation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding); - - // As ReduceWindow(), but the padding is given in the format - // returned by MakePadding(). 
- ComputationDataHandle ReduceWindowWithGeneralPadding( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, const Computation& computation, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding); - - // Returns the sum of the operand value across all replicas. All replicas - // supply one input to the sum and all replicas receive the resulting sum. - ComputationDataHandle CrossReplicaSum(const ComputationDataHandle& operand); - - // Enqueues an operation that scatters the `source` array to the selected - // indices of each window. - ComputationDataHandle SelectAndScatter( - const ComputationDataHandle& operand, const Computation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, Padding padding, - const ComputationDataHandle& source, - const ComputationDataHandle& init_value, const Computation& scatter); - - // As SelectAndScatter(), but the padding is given in the format - // returned by MakePadding(). - ComputationDataHandle SelectAndScatterWithGeneralPadding( - const ComputationDataHandle& operand, const Computation& select, - tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - const ComputationDataHandle& source, - const ComputationDataHandle& init_value, const Computation& scatter); - - // Enqueues an abs instruction onto the computation. - ComputationDataHandle Abs(const ComputationDataHandle& operand); - - // Enqueues a atan2 instruction onto the computation. - ComputationDataHandle Atan2( - const ComputationDataHandle& y, const ComputationDataHandle& x, - tensorflow::gtl::ArraySlice broadcast_dimensions = {}); - - // Enqueues an exp instruction onto the computation. - ComputationDataHandle Exp(const ComputationDataHandle& operand); - - // Enqueues a floor instruction onto the computation. - ComputationDataHandle Floor(const ComputationDataHandle& operand); - - // Enqueues a ceil instruction onto the computation. - ComputationDataHandle Ceil(const ComputationDataHandle& operand); - - // Enqueues a round instruction onto the computation, rounding to nearest even - // with half-way cases rounding away from zero. - ComputationDataHandle Round(const ComputationDataHandle& operand); - - // Enqueues an log instruction (natural logarithm) onto the computation. - ComputationDataHandle Log(const ComputationDataHandle& operand); - - // Enqueues a sign instruction onto the computation. - ComputationDataHandle Sign(const ComputationDataHandle& operand); - - // Enqueues a cosine instruction onto the computation. - ComputationDataHandle Cos(const ComputationDataHandle& operand); - - // Enqueues a sine instruction onto the computation. - ComputationDataHandle Sin(const ComputationDataHandle& operand); - - // Enqueues a tanh instruction onto the computation. - ComputationDataHandle Tanh(const ComputationDataHandle& operand); - - // Enqueues a real-part instruction onto the computation. - ComputationDataHandle Real(const ComputationDataHandle& operand); - - // Enqueues an imaginary-part instruction onto the computation. - ComputationDataHandle Imag(const ComputationDataHandle& operand); - - // Enqueues a float32 sqrt instruction onto the computation. - // (float32 is specified as there is an implicit float32 0.5f constant - // exponent). 
-  ComputationDataHandle SqrtF32(const ComputationDataHandle& operand);
-
-  // Enqueues a float32 square instruction onto the computation.
-  // (float32 is specified as there is an implicit float32 2.0f constant
-  // exponent).
-  ComputationDataHandle SquareF32(const ComputationDataHandle& operand);
-
-  // Enqueues an lhs^rhs computation onto the computation.
-  ComputationDataHandle Pow(
-      const ComputationDataHandle& lhs, const ComputationDataHandle& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions = {});
-
-  // Enqueues an operator that tests if the operand's values are finite, i.e.,
-  // not Inf or NaN. Defined only for floating-point types. Returns an array of
-  // booleans with the same shape where entries are true iff the corresponding
-  // entry was finite.
-  ComputationDataHandle IsFinite(const ComputationDataHandle& operand);
-
-  // Enqueues a convert instruction onto the computation that changes the
-  // element type of the operand array to primitive_type.
-  ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand,
-                                           PrimitiveType new_element_type);
-
-  // Enqueues a no-op instruction onto the computation that changes
-  // the element type of the operand array to primitive_type. The
-  // bit-widths of the source and destination element types must be
-  // identical.
-  ComputationDataHandle BitcastConvertType(const ComputationDataHandle& operand,
-                                           PrimitiveType new_element_type);
-
-  // Enqueues a float32 reciprocal instruction onto the computation.
-  // (float32 is specified as there is an implicit float32 -1.0f constant
-  // exponent).
-  //
-  // TODO(b/34468990) axe F32 suffix, can be determined by reflecting on the
-  // shape of the operand.
-  ComputationDataHandle ReciprocalF32(const ComputationDataHandle& operand);
-
-  // Enqueues a negate instruction onto the computation.
-  ComputationDataHandle Neg(const ComputationDataHandle& operand);
-
-  // Enqueues a count-leading-zeros instruction onto the computation.
-  ComputationDataHandle Clz(const ComputationDataHandle& operand);
-
-  // Enqueues a transpose instruction onto the computation.
-  ComputationDataHandle Transpose(
-      const ComputationDataHandle& operand,
-      tensorflow::gtl::ArraySlice<int64> permutation);
-
-  // Enqueues a reverse instruction onto the computation. The order of the
-  // elements in the given dimensions is reversed (i.e., the element at index i
-  // is moved to index dimension_size - 1 - i).
-  ComputationDataHandle Rev(const ComputationDataHandle& operand,
-                            tensorflow::gtl::ArraySlice<int64> dimensions);
-
-  // Enqueues a sort (in increasing order) instruction onto the computation.
-  ComputationDataHandle Sort(const ComputationDataHandle& operand);
-
-  // Enqueues a clamp instruction onto the computation.
-  ComputationDataHandle Clamp(const ComputationDataHandle& min,
-                              const ComputationDataHandle& operand,
-                              const ComputationDataHandle& max);
-
-  // Enqueues a map instruction onto the computation.
-  ComputationDataHandle Map(
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> operands,
-      const Computation& computation,
-      tensorflow::gtl::ArraySlice<int64> dimensions,
-      tensorflow::gtl::ArraySlice<ComputationDataHandle> static_operands = {});
-
-  // Enqueues an N(mu, sigma) random number generation instruction onto the
-  // computation.
-  ComputationDataHandle RngNormal(const ComputationDataHandle& mu,
-                                  const ComputationDataHandle& sigma,
-                                  const Shape& shape);
-
-  // Enqueues a U(a, b) random number generation instruction onto the
-  // computation. Returns values in the semi-open interval [a, b).
- ComputationDataHandle RngUniform(const ComputationDataHandle& a, - const ComputationDataHandle& b, - const Shape& shape); - - // Enqueues a while node onto the computation. - ComputationDataHandle While(const Computation& condition, - const Computation& body, - const ComputationDataHandle& init); - - // Enqueues a conditional node onto the computation. - ComputationDataHandle Conditional(const ComputationDataHandle& predicate, - const ComputationDataHandle& true_operand, - const Computation& true_computation, - const ComputationDataHandle& false_operand, - const Computation& false_computation); - - // Enqueues a ReducePrecision node onto the computation. - ComputationDataHandle ReducePrecision(const ComputationDataHandle& operand, - const int exponent_bits, - const int mantissa_bits); - - // Enqueues a Gather node onto the computation. - ComputationDataHandle Gather( - const ComputationDataHandle& input, - const ComputationDataHandle& gather_indices, - const GatherDimensionNumbers& dimension_numbers, - tensorflow::gtl::ArraySlice window_bounds); - - // Enqueues a Send node onto the computation, to send the given operand to - // a Recv instruction that shares the same channel handle. - void Send(const ComputationDataHandle& operand, const ChannelHandle& handle); - - // Enqueues a Recv node onto the computation. The data comes from a Send - // instruction that shares the same channel handle and its shape must - // be the same as the given shape. - ComputationDataHandle Recv(const Shape& shape, const ChannelHandle& handle); - - // Returns true if 'operand' is a compile-time constant. A compile-time - // constant does not depend on parameters with index greater than or equal to - // `num_parameters`, or on stateful operators such as `RngNormal` or `Infeed`. - // Unlike `ComputeConstant`, `IsConstant` tests whether a computation is a - // compile-time constant without evaluating the computation. - StatusOr IsConstant(const ComputationDataHandle& operand, - int64 num_parameters = 0); - - // Normalizes operand across spatial and batch dimensions for each feature. - // - // Returns a tuple (normalized, batch_mean, batch_var) where `normalized` - // is the normalized result and batch_mean and batch_var are the mean and - // variance, respectively, across batch for the operand. - ComputationDataHandle BatchNormTraining(const ComputationDataHandle& operand, - const ComputationDataHandle& scale, - const ComputationDataHandle& offset, - float epsilon, int64 feature_index); - - // Normalizes operand across spatial and batch dimensions for each feature. - // - // `BatchNormInference` is equivalent to calling `BatchNormTraining` without - // computing `mean` and `variance` for each batch inside the operation. It - // uses the input `mean` and `variance` instead as estimated values. The - // purpose of this op is to reduce latency in inference, hence the name - // `BatchNormInference`. - // - // The output has the same shape as `operand`, and contains the normalized - // values for each batch. - ComputationDataHandle BatchNormInference( - const ComputationDataHandle& operand, const ComputationDataHandle& scale, - const ComputationDataHandle& offset, const ComputationDataHandle& mean, - const ComputationDataHandle& variance, float epsilon, - int64 feature_index); - - // Calculates the gradients of a batch norm op. - // - // The inputs `batch_mean` and `batch_var` represent the mean and variance - // across the batch. 
-  //
-  // Returns a tuple of three elements:
-  //   - grad_operand: Gradient with respect to input `operand`
-  //   - grad_offset: Gradient with respect to input `offset`
-  //   - grad_scale: Gradient with respect to input `scale`
-  ComputationDataHandle BatchNormGrad(const ComputationDataHandle& operand,
-                                      const ComputationDataHandle& scale,
-                                      const ComputationDataHandle& batch_mean,
-                                      const ComputationDataHandle& batch_var,
-                                      const ComputationDataHandle& grad_output,
-                                      float epsilon, int64 feature_index);
-
-  // Computes the value of a constant indicated by a
-  // ComputationDataHandle using a non-optimized interpreter on the host.
-  //
-  // The operand must be from the computation currently being built -
-  // i.e., returned from this builder with no intervening call to
-  // Build(). This happens to currently work regardless of that, but
-  // that may stop working at any time.
-  //
-  // The operand must represent a constant value, which in this case
-  // means that it must not statically depend on any parameter of the
-  // computation that is being built other than the ones specified on the
-  // parameter list. The parameters in the list will be indexed by their
-  // parameter id property, so the number of parameters specified should be at
-  // least as many as the largest used parameter index.
-  //
-  // `IsConstant` can be used to test whether a computation is a compile-time
-  // constant without evaluating it. `ComputeConstant` only succeeds for
-  // computations where `IsConstant` returns true.
-  //
-  // This functionality can be useful when translating a computation
-  // into XLA where something that looked dynamic is required by
-  // XLA to be specified as a constant. E.g., the source
-  // computation (outside of XLA) may include a dynamic
-  // computation of the shape of something and ComputeConstant lets
-  // you determine what the value of that computation is in the case
-  // where the value can be determined at compile time.
-  //
-  // If output_layout is non-null, then the output of the computation
-  // will be stored using that layout.
-  StatusOr<std::unique_ptr<Literal>> ComputeConstant(
-      const ComputationDataHandle& operand,
-      const Layout* output_layout = nullptr,
-      tensorflow::gtl::ArraySlice<Literal> parameters = {});
-
-  // Returns a new ComputationBuilder whose resultant Computation is used only
-  // by this ComputationBuilder. The sub-ComputationBuilder has the same
-  // die_immediately_on_error behavior as the parent.
-  std::unique_ptr<ComputationBuilder> CreateSubBuilder(
-      const string& computation_name);
-
-  // Modifies the computation being built so that executions of it
-  // will return the value associated with operand, rather than the
-  // last expression enqueued on the ComputationBuilder. Any subsequent
-  // operations added to the ComputationBuilder will not have any effect unless
-  // SetReturnValue is called again.
-  Status SetReturnValue(const ComputationDataHandle& operand);
-
-  // Builds the computation with the requested operations, or returns a non-ok
-  // status.
-  StatusOr<Computation> Build();
-
-  // Builds the computation with the requested operations, or notes an error in
-  // the parent ComputationBuilder and returns an empty computation if building
-  // failed. This function is intended to be used where the returned
-  // Computation is only used by the parent ComputationBuilder, and hence further
-  // operations on the returned Computation will simply fail with an error if an
-  // error occurred while building this computation.
If the built computation is - // to be used by a ComputationBuilder other than the parent ComputationBuilder - // then Build() should be used instead. - Computation BuildAndNoteError(); - - // Returns the first error that was encountered while building the - // computation. When an error is encountered, by default we return a vacuous - // ComputationDataHandle and inform the user of the error that occurred while - // building the computation when they make a final call to Build(). - // - // See also set_die_immediately_on_error(). - Status first_error() const { return first_error_; } - - private: - // Limited checking of convolution parameters. Returns false on - // error. - bool VerifyConvolution(const Shape& lhs_shape, const Shape& rhs_shape, - const ConvolutionDimensionNumbers& dimension_numbers); - - // The parent ComputationBuilder of a sub-ComputationBuilder. The - // parent_builder_ will be the nullptr if not a sub-ComputationBuilder. - ComputationBuilder* parent_builder_{nullptr}; - - // Helper function for creating a Window proto from user-supplied - // data. Returns true if the user-supplied data was valid. - bool MakeWindow(tensorflow::gtl::ArraySlice window_dimensions, - tensorflow::gtl::ArraySlice window_strides, - tensorflow::gtl::ArraySlice> padding, - tensorflow::gtl::ArraySlice lhs_dilation, - tensorflow::gtl::ArraySlice rhs_dilation, - Window* window); - - // Internal helper method that does the building for an arbitrary unary op. - ComputationDataHandle UnaryOp(UnaryOperation unop, - const ComputationDataHandle& operand); - - // Internal helper method that does the building for an arbitrary binary op. - // broadcast_dimensions specifies which dimensions to use for broadcasting - // when the operation is between tensors of different ranks. - ComputationDataHandle BinaryOp( - BinaryOperation binop, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs, - tensorflow::gtl::ArraySlice broadcast_dimensions); - - // Internal helper method that does the building for an arbitrary ternary op. - ComputationDataHandle TernaryOp(TernaryOperation triop, - const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs, - const ComputationDataHandle& ehs); - - // Internal helper method that does the building for a random number generator - // of a given distribution with an explicitly specified shape. - ComputationDataHandle RngOp( - RandomDistribution distribution, - tensorflow::gtl::ArraySlice parameters, - const Shape& shape); - - // Populates computation_ with a valid object or returns a failing status. - // This is used before any given operation is enqueued. - Status PrepareComputation(); - - // Notes that the error occurred by: - // * storing it internally and capturing a backtrace if it's the first error - // (this deferred value will be produced on the call to Build()) - // * dying if die_immediately_on_error_ is true - void NoteError(const Status& error); - - // Helper function that runs the given op_request, filling in op_response. - // Before the op is run, PrepareComputation is called, and common fields in - // the op_request are filled in. - Status RunOp(OpRequest* op_request, OpResponse* op_response); - - // Helper function that calls RunOp and calls NoteError on failures. - void RunOpAndNoteError(OpRequest* op_request); - - // Helper function that calls RunOp and either returns the output computation - // data handle (on success) or a vacuous computation data handle (on failure). 
- ComputationDataHandle RunOpAndParseResponse(OpRequest* op_request); - - // Helper function that implements GetShape without noting errors. This makes - // it easier to ensure the real GetShape will note errors on every error path. - StatusOr> GetShapeWithoutNoteError( - const ComputationDataHandle& operand); - - string name_; // Name to use for the built computation. - - // The first error encountered while building the computation. - // This is OK until the first error is encountered. - Status first_error_; - - // The saved stack trace from the point at which the first error occurred. - tensorflow::SavedStackTrace first_error_backtrace_; - - // The computation that operations are enqueued onto. - Computation computation_; - - // The client that the computation is created in. Not owned. - Client* client_; - - // Mode bit that indicates whether to die when a first error is encountered. - bool die_immediately_on_error_ = false; - - // The metadata to attach to each op. This is structured as a "modal"-like - // operation, in order to simplify client code (and not sprinkle this metadata - // throughout the TensorFlow op kernel implementations). - OpMetadata metadata_; - - // Sharding for this operator. This is structured as a "model"-like operation, - // in order to simplify client code, similar to metadata_. - tensorflow::gtl::optional sharding_; - - TF_DISALLOW_COPY_AND_ASSIGN(ComputationBuilder); -}; - -template -ComputationDataHandle ComputationBuilder::ConstantR0(NativeT value) { - return ConstantLiteral(*Literal::CreateR0(value)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR1( - tensorflow::gtl::ArraySlice values) { - return ConstantLiteral(*Literal::CreateR1(values)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR1(int64 length, - NativeT value) { - Literal literal(ShapeUtil::MakeShape( - primitive_util::NativeToPrimitiveType(), {length})); - literal.PopulateWithValue(value); - return ConstantLiteral(literal); -} - -inline ComputationDataHandle ComputationBuilder::ConstantR1( - const tensorflow::core::Bitmap& values) { - return ConstantLiteral(*Literal::CreateR1(values)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR2( - std::initializer_list> values) { - return ConstantLiteral(*Literal::CreateR2(values)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantFromArrayWithLayout( - const Array& values, const Layout& layout) { - return ConstantLiteral( - *Literal::CreateFromArrayWithLayout(values, layout)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantFromArray( - const Array& values) { - return ConstantLiteral(*Literal::CreateFromArray(values)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR2FromArray2DWithLayout( - const Array2D& values, const Layout& layout) { - return ConstantLiteral( - *Literal::CreateFromArrayWithLayout(values, layout)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR2FromArray2D( - const Array2D& values) { - return ConstantLiteral(*Literal::CreateR2FromArray2D(values)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR3FromArray3DWithLayout( - const Array3D& values, const Layout& layout) { - return ConstantLiteral( - *Literal::CreateR3FromArray3DWithLayout(values, layout)); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR3FromArray3D( - const Array3D& values) { - return ConstantFromArray(values); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR4FromArray4DWithLayout( - 
const Array4D& values, const Layout& layout) { - return ConstantFromArrayWithLayout(values, layout); -} - -template -ComputationDataHandle ComputationBuilder::ConstantR4FromArray4D( - const Array4D& values) { - return ConstantFromArray(values); -} - -// RAII-style object: sets the current sharding assignment in builder on -// construction, and sets back to the previous assignment on destruction. -class ScopedShardingAssignment { - public: - ScopedShardingAssignment(xla::ComputationBuilder* builder, - tensorflow::gtl::optional sharding) - : builder_(builder), prev_sharding_(builder->sharding()) { - SetSharding(sharding); - } - - ~ScopedShardingAssignment() { SetSharding(prev_sharding_); } - - private: - void SetSharding(const tensorflow::gtl::optional& sharding) { - if (sharding.has_value()) { - builder_->SetSharding(sharding.value()); - } else { - builder_->ClearSharding(); - } - } - - xla::ComputationBuilder* const builder_; - tensorflow::gtl::optional prev_sharding_; - - TF_DISALLOW_COPY_AND_ASSIGN(ScopedShardingAssignment); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_CLIENT_COMPUTATION_BUILDER_H_ diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index 6e3c5cb484b8f1..7dee41f6a05025 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -87,6 +87,18 @@ ExecutableBuildOptions::dump_optimized_hlo_proto_to() const { return dump_optimized_hlo_proto_to_; } +ExecutableBuildOptions& +ExecutableBuildOptions::set_dump_unoptimized_hlo_proto_to( + tensorflow::StringPiece dirpath) { + dump_unoptimized_hlo_proto_to_ = dirpath.ToString(); + return *this; +} + +const tensorflow::gtl::optional& +ExecutableBuildOptions::dump_unoptimized_hlo_proto_to() const { + return dump_unoptimized_hlo_proto_to_; +} + ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to( tensorflow::StringPiece dirpath) { dump_per_pass_hlo_proto_to_ = dirpath.ToString(); diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index 11f10983606fe0..9dc9be4423564f 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_ #include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/optional.h" @@ -64,6 +65,13 @@ class ExecutableBuildOptions { tensorflow::StringPiece dirpath); const tensorflow::gtl::optional& dump_optimized_hlo_proto_to() const; + // If set, specifies a dirpath to dump the start-of-optimization-pipeline HLO + // protobuf to (as in DebugOptions). + ExecutableBuildOptions& set_dump_unoptimized_hlo_proto_to( + tensorflow::StringPiece dirpath); + const tensorflow::gtl::optional& dump_unoptimized_hlo_proto_to() + const; + // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs // to (as in DebugOptions). 
ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to( @@ -76,6 +84,13 @@ class ExecutableBuildOptions { ExecutableBuildOptions& set_hlo_profile(bool enabled); tensorflow::gtl::optional hlo_profile() const; + void add_disabled_hlo_pass(tensorflow::StringPiece pass_name) { + disabled_hlo_passes_.push_back(std::string(pass_name)); + } + const tensorflow::gtl::ArraySlice disabled_hlo_passes() const { + return disabled_hlo_passes_; + } + // Returns a string representation of the build options, suitable for // debugging. string ToString() const; @@ -87,8 +102,10 @@ class ExecutableBuildOptions { bool result_layout_set_ = false; tensorflow::gtl::optional generate_hlo_graph_; tensorflow::gtl::optional dump_optimized_hlo_proto_to_; + tensorflow::gtl::optional dump_unoptimized_hlo_proto_to_; tensorflow::gtl::optional dump_per_pass_hlo_proto_to_; DeviceMemoryAllocator* device_allocator_ = nullptr; + std::vector disabled_hlo_passes_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/client/global_data.cc b/tensorflow/compiler/xla/client/global_data.cc index 40f59eaa68ebeb..2986d406001370 100644 --- a/tensorflow/compiler/xla/client/global_data.cc +++ b/tensorflow/compiler/xla/client/global_data.cc @@ -31,7 +31,7 @@ GlobalData::~GlobalData() { *request.mutable_data() = handle_; UnregisterResponse response; VLOG(1) << "requesting to unregister " << handle_.ShortDebugString(); - tensorflow::Status s = parent_->Unregister(&request, &response); + Status s = parent_->Unregister(&request, &response); VLOG(1) << "done with request"; if (!s.ok()) { diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index 59c4a53c05a454..d49d959a6c8112 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -22,8 +22,6 @@ cc_library( "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", @@ -43,9 +41,8 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index 63df449e0b3bdd..a1d34796ccfd86 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -17,7 +17,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -27,28 +28,6 @@ limitations under the License. 
namespace xla { namespace { -using InstructionGenerator = - ComputationDataHandle (*)(ComputationBuilder*, const ComputationDataHandle&, - const ComputationDataHandle&); - -Computation CreateScalarComputation(const string& name, PrimitiveType type, - ComputationBuilder* builder, - InstructionGenerator generator) { - std::unique_ptr b; - if (type == PRED) { - b = builder->CreateSubBuilder(name); - } else { - b = builder->CreateSubBuilder( - tensorflow::strings::StrCat(name, "_", PrimitiveType_Name(type))); - } - - const Shape scalar = ShapeUtil::MakeShape(type, {}); - auto lhs = b->Parameter(0, scalar, "lhs"); - auto rhs = b->Parameter(1, scalar, "rhs"); - generator(b.get(), lhs, rhs); - return b->BuildAndNoteError(); -} - using XlaOpGenerator = XlaOp (*)(XlaBuilder*, const XlaOp&, const XlaOp&); XlaComputation CreateScalarComputation(const string& name, PrimitiveType type, @@ -71,71 +50,6 @@ XlaComputation CreateScalarComputation(const string& name, PrimitiveType type, } // namespace -Computation CreateScalarAddComputation(PrimitiveType type, - ComputationBuilder* builder) { - return CreateScalarComputation( - "add", type, builder, - [](ComputationBuilder* b, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs) { return b->Add(lhs, rhs); }); -} - -Computation CreateScalarMultiplyComputation(PrimitiveType type, - ComputationBuilder* builder) { - return CreateScalarComputation( - "mul", type, builder, - [](ComputationBuilder* b, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs) { return b->Mul(lhs, rhs); }); -} - -Computation CreateScalarGeComputation(PrimitiveType type, - ComputationBuilder* builder) { - return CreateScalarComputation( - "ge", type, builder, - [](ComputationBuilder* b, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs) { return b->Ge(lhs, rhs); }); -} - -Computation CreateScalarMaxComputation(PrimitiveType type, - ComputationBuilder* builder) { - return CreateScalarComputation( - "max", type, builder, - [](ComputationBuilder* b, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs) { return b->Max(lhs, rhs); }); -} - -Computation CreateScalarMinComputation(PrimitiveType type, - ComputationBuilder* builder) { - return CreateScalarComputation( - "min", type, builder, - [](ComputationBuilder* b, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs) { return b->Min(lhs, rhs); }); -} - -Computation CreateScalarAndComputation(ComputationBuilder* builder) { - return CreateScalarComputation( - "and", PRED, builder, - [](ComputationBuilder* b, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs) { return b->And(lhs, rhs); }); -} - -Computation CreateScalarOrComputation(ComputationBuilder* builder) { - return CreateScalarComputation( - "or", PRED, builder, - [](ComputationBuilder* b, const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs) { return b->Or(lhs, rhs); }); -} - -StatusOr Any(const ComputationDataHandle& predicates, - ComputationBuilder* builder) { - auto f = builder->ConstantR0(false); - Computation logical_or = CreateScalarOrComputation(builder); - TF_ASSIGN_OR_RETURN(std::unique_ptr predicates_shape, - builder->GetShape(predicates)); - std::vector all_dimensions(ShapeUtil::Rank(*predicates_shape)); - std::iota(all_dimensions.begin(), all_dimensions.end(), 0); - return builder->Reduce(predicates, f, logical_or, all_dimensions); -} - XlaComputation CreateScalarAddComputation(PrimitiveType type, XlaBuilder* builder) { return CreateScalarComputation( diff 
--git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h index f4d3fc801590fe..64b6b7d6335316 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.h +++ b/tensorflow/compiler/xla/client/lib/arithmetic.h @@ -18,83 +18,38 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { -// Creates a scalar add computation and returns it. -Computation CreateScalarAddComputation(PrimitiveType type, - ComputationBuilder* builder); - -// Creates a scalar multiply computation and returns it. -Computation CreateScalarMultiplyComputation(PrimitiveType type, - ComputationBuilder* builder); - -// Creates a scalar ge computation and returns it. -Computation CreateScalarGeComputation(PrimitiveType type, - ComputationBuilder* builder); - -// Creates a scalar max computation and returns it. -Computation CreateScalarMaxComputation(PrimitiveType type, - ComputationBuilder* builder); - -// Creates a scalar min computation and returns it. -Computation CreateScalarMinComputation(PrimitiveType type, - ComputationBuilder* builder); - -// Creates a scalar logical AND computation and returns it. -Computation CreateScalarAndComputation(ComputationBuilder* builder); - -// Creates a scalar logical OR computation and returns it. -Computation CreateScalarOrComputation(ComputationBuilder* builder); - -// Returns whether any predicate in "predicates" is set. -// -// Note: if predicates is zero-sized, Any() vacuously returns false. -StatusOr Any(const ComputationDataHandle& predicates, - ComputationBuilder* builder); - -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// // Creates a scalar add computation and returns it. XlaComputation CreateScalarAddComputation(PrimitiveType type, XlaBuilder* builder); -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// + // Creates a scalar multiply computation and returns it. XlaComputation CreateScalarMultiplyComputation(PrimitiveType type, XlaBuilder* builder); -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// + // Creates a scalar ge computation and returns it. XlaComputation CreateScalarGeComputation(PrimitiveType type, XlaBuilder* builder); -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// + // Creates a scalar max computation and returns it. XlaComputation CreateScalarMaxComputation(PrimitiveType type, XlaBuilder* builder); -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// + // Creates a scalar min computation and returns it. XlaComputation CreateScalarMinComputation(PrimitiveType type, XlaBuilder* builder); -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// + // Creates a scalar logical AND computation and returns it. XlaComputation CreateScalarAndComputation(XlaBuilder* builder); -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// // Creates a scalar logical OR computation and returns it. XlaComputation CreateScalarOrComputation(XlaBuilder* builder); -// TODO(b/74197823): This is a part of a NOT YET ready refactor. -// // Returns whether any predicate in "predicates" is set. // // Note: if predicates is zero-sized, Any() vacuously returns false. 
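As a usage illustration of the XlaBuilder-based helpers that this change keeps in arithmetic.h, the following sketch reduces a rank-1 F32 parameter to its scalar sum with CreateScalarAddComputation. It is not part of the patch: `XlaBuilder::ConstantR0` and `XlaBuilder::Reduce` are assumed to mirror the removed ComputationBuilder methods of the same names, and `BuildVectorSum` is a hypothetical helper name.

```c++
// Illustrative sketch only; ConstantR0/Reduce are assumed analogues of the
// removed ComputationBuilder API and do not appear in this patch.
#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/statusor.h"

xla::StatusOr<xla::XlaComputation> BuildVectorSum() {
  xla::XlaBuilder builder("vector_sum");
  // Rank-1 F32 input of length 8 (arbitrary size for the example).
  auto input = builder.Parameter(
      /*parameter_number=*/0, xla::ShapeUtil::MakeShape(xla::F32, {8}),
      "input");
  // The scalar add computation serves as the reduction operator.
  xla::XlaComputation add =
      xla::CreateScalarAddComputation(xla::F32, &builder);
  builder.Reduce(input, builder.ConstantR0<float>(0.0f), add,
                 /*dimensions_to_reduce=*/{0});
  return builder.Build();  // Computation rooted at the Reduce above.
}
```

The same pattern underlies the removed Any() helper above, which reduced a predicate array with CreateScalarOrComputation over all of its dimensions.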
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index 311dc4bdd72cfd..3380af9f303b1d 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -15,8 +15,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/lib/testing.h"
-#include "tensorflow/compiler/xla/client/computation.h"
-#include "tensorflow/compiler/xla/client/computation_builder.h"
+#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -46,16 +45,14 @@ int64 DataSizeOfShape(const Shape& shape) {
   return total_size;
 }
-// Creates a ComputationDataHandle for an op that generates fake data with the
-// given shape.
-ComputationDataHandle BuildFakeDataOpOnDevice(const Shape& shape,
-                                              ComputationBuilder* builder) {
+// Creates an XlaOp for an op that generates fake data with the given shape.
+XlaOp BuildFakeDataOpOnDevice(const Shape& shape, XlaBuilder* builder) {
   if (ShapeUtil::IsArray(shape)) {
     return builder->Broadcast(
         builder->ConstantLiteral(Literal::One(shape.element_type())),
         AsInt64Slice(shape.dimensions()));
   }
-  std::vector<ComputationDataHandle> parts;
+  std::vector<XlaOp> parts;
   for (const Shape& s : shape.tuple_shapes()) {
     parts.push_back(BuildFakeDataOpOnDevice(s, builder));
   }
@@ -64,11 +61,10 @@ ComputationDataHandle BuildFakeDataOpOnDevice(const Shape& shape,
 std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
                                                        Client* client) {
-  ComputationBuilder b(
-      client,
+  XlaBuilder b(
       tensorflow::strings::StrCat("make_fake_", ShapeUtil::HumanString(shape)));
   BuildFakeDataOpOnDevice(shape, &b);
-  Computation computation = b.Build().ConsumeValueOrDie();
+  XlaComputation computation = b.Build().ConsumeValueOrDie();
   auto execution_options = CreateDefaultExecutionOptions();
   *execution_options.mutable_shape_with_output_layout() = shape;
@@ -96,21 +92,6 @@ std::unique_ptr<GlobalData> MakeFakeDataOrDie(const Shape& shape,
   return MakeFakeDataViaDeviceOrDie(shape, client);
 }
-std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
-    const Computation& computation, Client* client) {
-  auto program_shape =
-      client->GetComputationShape(computation).ConsumeValueOrDie();
-
-  // For every (unbound) parameter that the computation wants, we manufacture
-  // some arbitrary data so that we can invoke the computation.
-  std::vector<std::unique_ptr<GlobalData>> fake_arguments;
-  for (const Shape& parameter : program_shape->parameters()) {
-    fake_arguments.push_back(MakeFakeDataOrDie(parameter, client));
-  }
-
-  return fake_arguments;
-}
-
 std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
     const XlaComputation& computation, Client* client) {
   CHECK(computation.proto().has_program_shape())
diff --git a/tensorflow/compiler/xla/client/lib/testing.h b/tensorflow/compiler/xla/client/lib/testing.h
index 1dc2622972d5fd..dc613099e2b42a 100644
--- a/tensorflow/compiler/xla/client/lib/testing.h
+++ b/tensorflow/compiler/xla/client/lib/testing.h
@@ -20,7 +20,6 @@ limitations under the License.
#include #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -33,12 +32,6 @@ namespace xla { std::unique_ptr MakeFakeDataOrDie(const Shape& shape, Client* client); -// Returns vector of GlobalData handles of fake data (created using -// MakeFakeDataOrDie) that are correctly shaped arguments for the given -// computation. -std::vector> MakeFakeArgumentsOrDie( - const Computation& computation, Client* client); - // Returns vector of GlobalData handles of fake data (created using // MakeFakeDataOrDie) that are correctly shaped arguments for the given // xla computation. diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 1c1270590375ab..ae0308020d014e 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -48,30 +48,52 @@ LocalExecutable::LocalExecutable(std::unique_ptr executable, << "Must have a valid device ordinal that the executable was built for."; } -tensorflow::Status LocalExecutable::ValidateExecutionOptions( +Status LocalExecutable::ValidateExecutionOptions( const tensorflow::gtl::ArraySlice arguments, const ExecutableRunOptions& run_options, const Backend& backend) { - const ComputationLayout& computation_layout = - executable_->module_config().entry_computation_layout(); + const ComputationLayout& host_computation_layout = + executable_->module_config().host_entry_computation_layout(); + const ComputationLayout& device_computation_layout = + executable_->module_config().device_entry_computation_layout(); // Check argument number, shapes, and layouts. 
- if (arguments.size() != computation_layout.parameter_count()) { + if (arguments.size() != host_computation_layout.parameter_count()) { return InvalidArgument( "invalid number of arguments for computation: expected %d, got %zu", - computation_layout.parameter_count(), arguments.size()); + host_computation_layout.parameter_count(), arguments.size()); + } + if (arguments.size() != device_computation_layout.parameter_count()) { + return InvalidArgument( + "invalid number of arguments for computation: expected %d, got %zu", + device_computation_layout.parameter_count(), arguments.size()); } for (int i = 0; i < arguments.size(); ++i) { - if (!computation_layout.parameter_layout(i).MatchesLayoutInShape( + if (!host_computation_layout.parameter_layout(i).MatchesLayoutInShape( arguments[i]->on_host_shape())) { return InvalidParameterArgument( executable_.get(), i, - "Argument does not match shape or layout of computation parameter " + "Argument does not match host shape or layout of computation " + "parameter " "%d: want %s, got %s", i, - ShapeUtil::HumanString(computation_layout.parameter_layout(i).shape()) + ShapeUtil::HumanString( + host_computation_layout.parameter_layout(i).shape()) .c_str(), ShapeUtil::HumanString(arguments[i]->on_host_shape()).c_str()); } + if (!device_computation_layout.parameter_layout(i).MatchesLayoutInShape( + arguments[i]->on_device_shape())) { + return InvalidParameterArgument( + executable_.get(), i, + "Argument does not match device shape or layout of computation " + "parameter " + "%d: want %s, got %s", + i, + ShapeUtil::HumanString( + device_computation_layout.parameter_layout(i).shape()) + .c_str(), + ShapeUtil::HumanString(arguments[i]->on_device_shape()).c_str()); + } } if (run_options.stream() != nullptr) { @@ -163,7 +185,7 @@ StatusOr LocalExecutable::Run( run_options, backend_->StreamBorrower(), backend_->eigen_intra_op_thread_pool()); - if (executable_->dumping()) { + if (executable_->dumping_snapshot()) { return ExecuteAndDump(&service_options, arguments); } return executable_->ExecuteOnStreamWrapper( @@ -173,36 +195,36 @@ StatusOr LocalExecutable::Run( StatusOr LocalExecutable::ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice arguments) { - executable_->session_module()->set_execution_platform( + executable_->hlo_snapshot()->set_execution_platform( backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module())); + TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->hlo_snapshot())); TF_ASSIGN_OR_RETURN( ScopedShapedBuffer result, executable_->ExecuteOnStream(run_options, arguments, /*hlo_execution_profile=*/nullptr)); - TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module())); - TF_RETURN_IF_ERROR(executable_->DumpSessionModule()); + TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot())); + TF_RETURN_IF_ERROR(executable_->DumpHloSnapshot()); return std::move(result); } -tensorflow::Status LocalExecutable::RecordArguments( +Status LocalExecutable::RecordArguments( const tensorflow::gtl::ArraySlice arguments, - SessionModule* session_module) { - session_module->clear_arguments(); + HloSnapshot* hlo_snapshot) { + hlo_snapshot->clear_arguments(); for (const ShapedBuffer* argument : arguments) { TF_ASSIGN_OR_RETURN(std::unique_ptr literal, LiteralFromShapedBuffer(*argument)); - *session_module->add_arguments() = literal->ToProto(); + *hlo_snapshot->add_arguments() = literal->ToProto(); } return Status::OK(); } 
-tensorflow::Status LocalExecutable::RecordResult( - const ShapedBuffer* result, SessionModule* session_module) { - session_module->clear_result(); +Status LocalExecutable::RecordResult(const ShapedBuffer* result, + HloSnapshot* hlo_snapshot) { + hlo_snapshot->clear_result(); TF_ASSIGN_OR_RETURN(std::unique_ptr literal, LiteralFromShapedBuffer(*result)); - *session_module->mutable_result() = literal->ToProto(); + *hlo_snapshot->mutable_result() = literal->ToProto(); return Status::OK(); } @@ -239,25 +261,6 @@ Backend* LocalClient::mutable_backend() { return local_service_->mutable_backend(); } -StatusOr> LocalClient::Compile( - const Computation& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const ExecutableBuildOptions& options) { - ExecutableBuildOptions updated_options = options; - if (options.device_ordinal() == -1) { - updated_options.set_device_ordinal(default_device_ordinal()); - VLOG(3) << "Set device ordinal to default value of: " - << updated_options.device_ordinal(); - } - TF_ASSIGN_OR_RETURN( - std::unique_ptr executable, - local_service_->CompileExecutable(computation.handle(), argument_layouts, - updated_options)); - return WrapUnique(new LocalExecutable(std::move(executable), - local_service_->mutable_backend(), - updated_options)); -} - StatusOr> LocalClient::Compile( const XlaComputation& computation, const tensorflow::gtl::ArraySlice argument_layouts, @@ -301,6 +304,11 @@ StatusOr> LocalClient::ShapedBufferToLiteral( shaped_buffer); } +StatusOr LocalClient::GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number) { + return local_service_->GlobalDataToShapedBuffer(data, replica_number); +} + Status LocalClient::TransferToInfeedLocal(const Literal& literal, int device_ordinal) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index f306c520ede001..4d9e0d7cd9d6dd 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -19,12 +19,13 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/service/compiler.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/statusor.h" @@ -42,15 +43,6 @@ class LocalExecutable { const tensorflow::gtl::ArraySlice arguments, ExecutableRunOptions run_options); - // Return the layout (contained in a shape) of the result produced by the - // computation. - const Shape& result_layout() const { - return executable_->module_config() - .entry_computation_layout() - .result_layout() - .shape(); - } - // Return the options used to build the executable. const ExecutableBuildOptions& build_options() const { return build_options_; } @@ -67,25 +59,30 @@ class LocalExecutable { // Validates that the given arguments and options satisfy various constraints // of the computation. 
- tensorflow::Status ValidateExecutionOptions( + // + // The given ExecutableRunOptions override any values from legacy_flags + // (TF_XLA_FLAGS environment variable). + Status ValidateExecutionOptions( const tensorflow::gtl::ArraySlice arguments, const ExecutableRunOptions& run_options, const Backend& backend); // Records the computation in a SessionModule proto with the arguments used to // invoke it, and the result. Enabled by flag: --tla_dump_executions_to. + // + // The given ServiceExecutableRunOptions override any values from legacy_flags + // (TF_XLA_FLAGS environment variable). StatusOr ExecuteAndDump( const ServiceExecutableRunOptions* run_options, const tensorflow::gtl::ArraySlice arguments); // Records the arguments used to invoke the computation in a SessionModule // proto. - tensorflow::Status RecordArguments( + Status RecordArguments( const tensorflow::gtl::ArraySlice arguments, - SessionModule* session_module); + HloSnapshot* hlo_snapshot); // Records the result of the computation in a SessionModule proto. - tensorflow::Status RecordResult(const ShapedBuffer* result, - SessionModule* session_module); + Status RecordResult(const ShapedBuffer* result, HloSnapshot* hlo_snapshot); // Returns a literal containing the contents of the given ShapedBuffer. StatusOr> LiteralFromShapedBuffer( @@ -116,17 +113,11 @@ class LocalClient : public Client { LocalClient(const LocalClient&) = delete; void operator=(const LocalClient&) = delete; - // Build and return a LocalExecutable object. The executable is compiled using - // the given argument layouts and options. - StatusOr> Compile( - const Computation& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const ExecutableBuildOptions& options); - // Build and return a LocalExecutable object. The executable is compiled using // the given XlaComputation, argument layouts and options. // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. + // The given ExecutableBuildOptions override any values from legacy_flags + // (TF_XLA_FLAGS environment variable). StatusOr> Compile( const XlaComputation& computation, const tensorflow::gtl::ArraySlice argument_layouts, @@ -145,6 +136,11 @@ class LocalClient : public Client { StatusOr> ShapedBufferToLiteral( const ShapedBuffer& shaped_buffer); + // Converts a GlobalDataHandle into a pointer to a ShapedBuffer that's valid + // as long as the handle is valid. + StatusOr GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number); + // Transfer the given literal to the infeed queue of the given device. // TODO(b/69670845): Remove the 'Local' from the name when LocalClient does // not inherit from Client and there is no possibility of confusion with diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD index 0d6e207971ec64..507a2dc5f088e1 100644 --- a/tensorflow/compiler/xla/client/xla_client/BUILD +++ b/tensorflow/compiler/xla/client/xla_client/BUILD @@ -37,7 +37,6 @@ cc_library( ], ) -# TODO(b/74197823): Replace computation_builder with xla_builder. 
cc_library( name = "xla_builder", srcs = ["xla_builder.cc"], diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc index 1899983e442116..5e17cc4dfb0b22 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc @@ -57,16 +57,6 @@ bool CanBeRoot(HloOpcode opcode) { } } -StatusOr> GetOperandShapes( - tensorflow::gtl::ArraySlice operands) { - std::vector operand_shapes; - for (const XlaOp& operand : operands) { - TF_ASSIGN_OR_RETURN(const Shape& shape, operand.GetShape()); - operand_shapes.push_back(shape); - } - return operand_shapes; -} - } // namespace StatusOr XlaBuilder::GetShape(const XlaOp& op) const { @@ -76,12 +66,14 @@ StatusOr XlaBuilder::GetShape(const XlaOp& op) const { return instr->shape(); } -StatusOr XlaOp::GetShape() const { - if (builder_ == nullptr) { - return InvalidArgument( - "cannot GetShape for an invalid XlaOp with handle %lld", handle()); +StatusOr> XlaBuilder::GetOperandShapes( + tensorflow::gtl::ArraySlice operands) const { + std::vector operand_shapes; + for (const XlaOp& operand : operands) { + TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); + operand_shapes.push_back(shape); } - return builder_->GetShape(*this); + return operand_shapes; } XlaBuilder::XlaBuilder(const string& computation_name) @@ -286,7 +278,7 @@ StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, const XlaOp& operand) { TF_RETURN_IF_ERROR(first_error_); - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); CHECK(ShapeUtil::IsScalar(operand_shape) || ShapeUtil::Rank(operand_shape) == ShapeUtil::Rank(output_shape)); @@ -325,7 +317,7 @@ StatusOr XlaBuilder::AddBroadcastSequence(const Shape& output_shape, XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) { return NoteErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), ShapeInference::InferUnaryOpShape(unop, operand_shape)); return AddInstruction(std::move(instr), unop, {operand}); @@ -337,8 +329,8 @@ XlaOp XlaBuilder::BinaryOp( tensorflow::gtl::ArraySlice broadcast_dimensions) { return NoteErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape()); - TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), ShapeInference::InferBinaryOpShape( binop, lhs_shape, rhs_shape, broadcast_dimensions)); @@ -374,12 +366,12 @@ XlaOp XlaBuilder::BinaryOp( updated_rhs = !should_broadcast_lhs ? 
broadcasted_operand : rhs; } - TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, updated_lhs.GetShape()); + TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, GetShape(updated_lhs)); if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) { TF_ASSIGN_OR_RETURN(updated_lhs, AddBroadcastSequence(instr.shape(), updated_lhs)); } - TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, updated_rhs.GetShape()); + TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, GetShape(updated_rhs)); if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) { TF_ASSIGN_OR_RETURN(updated_rhs, AddBroadcastSequence(instr.shape(), updated_rhs)); @@ -393,9 +385,9 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs, const XlaOp& ehs) { return NoteErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, lhs.GetShape()); - TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, rhs.GetShape()); - TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, ehs.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs)); + TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs)); + TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs)); TF_ASSIGN_OR_RETURN(*instr.mutable_shape(), ShapeInference::InferTernaryOpShape( triop, lhs_shape, rhs_shape, ehs_shape)); @@ -437,7 +429,7 @@ XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs, return BinaryOp(HloOpcode::kMultiply, lhs, rhs, broadcast_dimensions); } -XlaOp XlaBuilder::ConstantLiteral(const Literal& literal) { +XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) { return NoteErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; *instr.mutable_shape() = literal.shape(); @@ -485,7 +477,7 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape, XlaOp XlaBuilder::Broadcast( const XlaOp& operand, tensorflow::gtl::ArraySlice broadcast_sizes) { return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN( const Shape& shape, ShapeInference::InferBroadcastShape(operand_shape, broadcast_sizes)); @@ -633,7 +625,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand, tensorflow::gtl::ArraySlice dimensions, tensorflow::gtl::ArraySlice new_sizes) { return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN(const Shape& shape, ShapeInference::InferReshapeShape( operand_shape, dimensions, new_sizes)); @@ -647,7 +639,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand, XlaOp XlaBuilder::Reshape(const XlaOp& operand, tensorflow::gtl::ArraySlice new_sizes) { return NoteErrorOrReturn([&]() -> StatusOr { - TF_ASSIGN_OR_RETURN(auto shape, operand.GetShape()); + TF_ASSIGN_OR_RETURN(auto shape, GetShape(operand)); std::vector dimensions(shape.dimensions_size()); std::iota(dimensions.begin(), dimensions.end(), 0); return Reshape(operand, dimensions, new_sizes); @@ -1002,7 +994,7 @@ XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type, const tensorflow::gtl::ArraySlice fft_length) { return NoteErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN( *instr.mutable_shape(), ShapeInference::InferFftShape(operand_shape, fft_type, 
fft_length)); @@ -1173,6 +1165,10 @@ XlaOp XlaBuilder::Exp(const XlaOp& operand) { return UnaryOp(HloOpcode::kExp, operand); } +XlaOp XlaBuilder::Expm1(const XlaOp& operand) { + return UnaryOp(HloOpcode::kExpm1, operand); +} + XlaOp XlaBuilder::Floor(const XlaOp& operand) { return UnaryOp(HloOpcode::kFloor, operand); } @@ -1189,6 +1185,10 @@ XlaOp XlaBuilder::Log(const XlaOp& operand) { return UnaryOp(HloOpcode::kLog, operand); } +XlaOp XlaBuilder::Log1p(const XlaOp& operand) { + return UnaryOp(HloOpcode::kLog1p, operand); +} + XlaOp XlaBuilder::Sign(const XlaOp& operand) { return UnaryOp(HloOpcode::kSign, operand); } @@ -1225,7 +1225,7 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand, tensorflow::gtl::ArraySlice permutation) { return NoteErrorOrReturn([&]() -> StatusOr { HloInstructionProto instr; - TF_ASSIGN_OR_RETURN(const Shape& operand_shape, operand.GetShape()); + TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN( *instr.mutable_shape(), ShapeInference::InferTransposeShape(operand_shape, permutation)); @@ -1613,13 +1613,35 @@ XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale, XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) { return NoteErrorOrReturn([&]() -> StatusOr { - HloInstructionProto instr; + TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand)); + const Shape& scalar_shape = ShapeUtil::MakeShape(shape.element_type(), {}); + auto b = CreateSubBuilder("sum"); + b->Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"), + b->Parameter(/*parameter_number=*/1, scalar_shape, "y")); + TF_ASSIGN_OR_RETURN(auto computation, b->Build()); + return CrossReplicaSum(operand, computation, /*replica_group_ids=*/{}, + /*channel_id=*/tensorflow::gtl::nullopt); + }); +} +XlaOp XlaBuilder::CrossReplicaSum( + const XlaOp& operand, const XlaComputation& computation, + tensorflow::gtl::ArraySlice replica_group_ids, + const tensorflow::gtl::optional& channel_id) { + return NoteErrorOrReturn([&]() -> StatusOr { + if (!replica_group_ids.empty() || channel_id.has_value()) { + return Unimplemented( + "replica_group_ids and channel_id and is not supported in AllReduce"); + } + + HloInstructionProto instr; TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand)); TF_ASSIGN_OR_RETURN( *instr.mutable_shape(), ShapeInference::InferCrossReplicaSumShape({&operand_shape})); + AddCalledComputation(computation, &instr); + return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum, {operand}); }); @@ -1948,11 +1970,18 @@ StatusOr XlaBuilder::LookUpInstruction( const XlaOp& op) const { TF_RETURN_IF_ERROR(first_error_); + if (op.builder_ == nullptr) { + return InvalidArgument( + "invalid XlaOp with handle %lld; the builder of this op is freed", + op.handle()); + } if (op.builder_ != this) { - return InvalidArgument("invalid XlaOp with handle %lld", op.handle()); + return InvalidArgument( + "XlaOp with handle %lld is built by builder '%s', but is trying to use " + "it in builder '%s'", + op.handle(), op.builder_->name().c_str(), this->name().c_str()); } - TF_RET_CHECK(op.builder_ == this); if (op.handle() >= instructions_.size() || op.handle() < 0) { return InvalidArgument("no XlaOp value %lld", op.handle()); } diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h index 4955f1515d66af..532cae014848e1 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h @@ -13,10 +13,6 
@@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// TODO(b/74197823): Replace computation_builder.h with this file. -// -// This is NOT YET ready to use. - #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_CLIENT_XLA_BUILDER_H_ @@ -48,15 +44,11 @@ class XlaBuilder; // This represents an instruction that has been enqueued using the XlaBuilder. // This is used to pass to subsequent computations that depends upon the // instruction as an operand. -// -// TODO(b/74197823): Replace xla::ComputationDataHandle with this one. class XlaOp { public: XlaOp() : handle_(0), builder_(nullptr) {} ~XlaOp() {} - StatusOr GetShape() const; - const XlaBuilder* builder() const { return builder_; } bool operator==(const XlaOp& rhs) const { @@ -87,8 +79,6 @@ class XlaOp { // A convenient interface for building up computations. // // Thread-compatible. -// -// TODO(b/74197823): Replace xla::ComputationBuilder with this one. class XlaBuilder { public: // computation_name: name to use for the built computation. @@ -139,7 +129,7 @@ class XlaBuilder { // Enqueues a constant with the value of the given literal onto the // computation. - XlaOp ConstantLiteral(const Literal& literal); + XlaOp ConstantLiteral(const LiteralSlice& literal); // Enqueues a constant onto the computation. Methods are templated on the // native host type (NativeT) which corresponds to a specific XLA @@ -542,6 +532,29 @@ class XlaBuilder { // supply one input to the sum and all replicas receive the resulting sum. XlaOp CrossReplicaSum(const XlaOp& operand); + // Enqueues an operation that does an AllReduce of the operand across cores. Here + // AllReduce means doing a reduction on the input operand across cores and then + // broadcasting the reduction result to those cores. The reduction function is + // defined by `computation`, which should be a commutative computation on + // scalars, e.g., add, min, or max. The way that AllReduce is applied is + // configured by: + // + // - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all + // replicas belong to one group. Allreduce will be applied within subgroups. + // For example, with 4 replicas, replica_group_ids={0,1,0,1} means + // replicas 0 and 2 are in subgroup 0, and replicas 1 and 3 are in subgroup 1. + // + // - `channel_id`: for Allreduce nodes from different models, if they have the + // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be + // applied across models. + // + // TODO(b/79737069): Rename this to AllReduce when it's ready to use. + XlaOp CrossReplicaSum( + const XlaOp& operand, const XlaComputation& computation, + tensorflow::gtl::ArraySlice replica_group_ids = {}, + const tensorflow::gtl::optional& channel_id = + tensorflow::gtl::nullopt); + // Enqueues an operation that scatters the `source` array to the selected + // indices of each window. XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select, @@ -571,6 +584,9 @@ class XlaBuilder { // Enqueues an exp instruction onto the computation. XlaOp Exp(const XlaOp& operand); + // Enqueues an expm1 instruction onto the computation. + XlaOp Expm1(const XlaOp& operand); + // Enqueues a floor instruction onto the computation. XlaOp Floor(const XlaOp& operand); @@ -584,6 +600,9 @@ class XlaBuilder { // Enqueues an log instruction (natural logarithm) onto the computation.
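For completeness, a hedged caller-side sketch of the new two-argument `CrossReplicaSum` overload declared above, mirroring the sub-builder pattern used by the one-argument implementation in `xla_builder.cc`. The `CrossReplicaMax` helper name is hypothetical, and the sketch assumes `CreateSubBuilder` and `Max` are available on `XlaBuilder` at this revision; the subgroup and channel arguments are left at their defaults because the implementation still rejects non-default values as unimplemented.

```c++
namespace xla {

// Hypothetical helper: a cross-replica MAX, built by handing CrossReplicaSum
// an explicit scalar reduction computation instead of the default "sum".
// Assumes `x` is an F32 operand built on the same builder `b`.
XlaOp CrossReplicaMax(XlaBuilder* b, const XlaOp& x) {
  const Shape scalar = ShapeUtil::MakeShape(F32, {});
  auto sub = b->CreateSubBuilder("max");
  sub->Max(sub->Parameter(/*parameter_number=*/0, scalar, "a"),
           sub->Parameter(/*parameter_number=*/1, scalar, "b"));
  XlaComputation reduction = sub->Build().ConsumeValueOrDie();

  // replica_group_ids defaults to {} (one group) and channel_id to nullopt
  // (no cross-model reduction), the only combination supported so far.
  return b->CrossReplicaSum(x, reduction);
}

}  // namespace xla
```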
XlaOp Log(const XlaOp& operand); + // Enqueues an log1p instruction (log(x+1)) onto the computation. + XlaOp Log1p(const XlaOp& operand); + // Enqueues a sign instruction onto the computation. XlaOp Sign(const XlaOp& operand); @@ -847,6 +866,10 @@ class XlaBuilder { // computation and fills the root_id in the pointer. StatusOr GetProgramShape(int64* root_id) const; + // Returns shapes for the operands. + StatusOr> GetOperandShapes( + tensorflow::gtl::ArraySlice operands) const; + // A visitor which checks whether an operation is a compile-time constant, // meaning that it doesn't depend on any parameters, or on any stateful // operation such as `RngNormal` or `Infeed`. The visitor walks the @@ -981,8 +1004,6 @@ XlaOp XlaBuilder::ConstantR4FromArray4D(const Array4D& values) { // RAII-style object: sets the current sharding assignment in builder on // construction, and sets back to the previous assignment on destruction. -// -// TODO(b/74197823): This is a part of a NOT YET ready refactor. class XlaScopedShardingAssignment { public: XlaScopedShardingAssignment(xla::XlaBuilder* builder, diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc index ce984564d016ce..2df3ea3af0d4fc 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_client/xla_builder_test.cc @@ -76,7 +76,7 @@ TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) { auto y = b.Parameter(1, y_shape, "y"); auto add = b.Add(x, y, /*broadcast_dimensions=*/{0, 1}); - TF_ASSERT_OK_AND_ASSIGN(auto add_shape, add.GetShape()); + TF_ASSERT_OK_AND_ASSIGN(auto add_shape, b.GetShape(add)); EXPECT_TRUE(ShapeUtil::Equal(add_shape, x_shape)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); @@ -188,8 +188,10 @@ TEST_F(XlaBuilderTest, OperandFromWrongBuilder) { builder.Add(p0, p0); auto statusor = builder.Build(); ASSERT_FALSE(statusor.ok()); - EXPECT_THAT(statusor.status().error_message(), - HasSubstr("Do not add XlaOp from builder b1 to builder main")); + EXPECT_THAT( + statusor.status().error_message(), + HasSubstr( + "built by builder 'b1', but is trying to use it in builder 'main'")); } TEST_F(XlaBuilderTest, ReshapeDefaultOrder) { diff --git a/tensorflow/compiler/xla/client/xla_client/xla_computation.h b/tensorflow/compiler/xla/client/xla_client/xla_computation.h index b70b57e9ffec40..0ffba208b1f868 100644 --- a/tensorflow/compiler/xla/client/xla_client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_client/xla_computation.h @@ -25,8 +25,6 @@ limitations under the License. namespace xla { // The computation graph that the user builds up with the XlaBuilder. -// -// TODO(b/74197823): Replace xla::Computation with this one. class XlaComputation { public: XlaComputation() : unique_id_(-1) {} diff --git a/tensorflow/compiler/xla/error_spec.h b/tensorflow/compiler/xla/error_spec.h new file mode 100644 index 00000000000000..a1463aa15941b9 --- /dev/null +++ b/tensorflow/compiler/xla/error_spec.h @@ -0,0 +1,37 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_ERROR_SPEC_H_ +#define TENSORFLOW_COMPILER_XLA_ERROR_SPEC_H_ + +namespace xla { + +// Structure describing permissible absolute and relative error bounds. +struct ErrorSpec { + explicit ErrorSpec(float aabs, float arel = 0, bool relaxed_nans = false) + : abs(aabs), rel(arel), relaxed_nans(relaxed_nans) {} + + float abs; // Absolute error bound. + float rel; // Relative error bound. + + // If relaxed_nans is true then any result is valid if we are expecting NaNs. + // In effect, this allows the tested operation to produce incorrect results + // for inputs outside its mathematical domain. + bool relaxed_nans; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_ERROR_SPEC_H_ diff --git a/tensorflow/compiler/xla/executable_run_options.cc b/tensorflow/compiler/xla/executable_run_options.cc index 99b8f0558e6e39..a472747bd174e3 100644 --- a/tensorflow/compiler/xla/executable_run_options.cc +++ b/tensorflow/compiler/xla/executable_run_options.cc @@ -45,17 +45,6 @@ stream_executor::Stream* ExecutableRunOptions::stream() const { return stream_; } -ExecutableRunOptions& ExecutableRunOptions::set_inter_op_thread_pool( - tensorflow::thread::ThreadPool* inter_op_thread_pool) { - inter_op_thread_pool_ = inter_op_thread_pool; - return *this; -} - -tensorflow::thread::ThreadPool* ExecutableRunOptions::inter_op_thread_pool() - const { - return inter_op_thread_pool_; -} - ExecutableRunOptions& ExecutableRunOptions::set_intra_op_thread_pool( const Eigen::ThreadPoolDevice* intra_op_thread_pool) { intra_op_thread_pool_ = intra_op_thread_pool; diff --git a/tensorflow/compiler/xla/executable_run_options.h b/tensorflow/compiler/xla/executable_run_options.h index a306ae16ba4aee..416131be006e6e 100644 --- a/tensorflow/compiler/xla/executable_run_options.h +++ b/tensorflow/compiler/xla/executable_run_options.h @@ -65,12 +65,6 @@ class ExecutableRunOptions { ExecutableRunOptions& set_stream(stream_executor::Stream* stream); stream_executor::Stream* stream() const; - // Sets the thread pool on which to run parallel CPU backend - // computations. Does not take ownership. - ExecutableRunOptions& set_inter_op_thread_pool( - tensorflow::thread::ThreadPool* inter_op_thread_pool); - tensorflow::thread::ThreadPool* inter_op_thread_pool() const; - // Sets the thread pool device on which to run Eigen subcomputations. // Does not take ownership. 
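To make the `ErrorSpec` fields introduced above concrete: in the comparator added later in this change, an element only counts as a mismatch when it exceeds *both* the absolute and the relative bound, or when its NaN-ness disagrees with the expectation (subject to `relaxed_nans`). A standalone illustration of that acceptance rule, not the library code itself:

```c++
#include <cmath>
#include <cstdio>

// Stand-in for xla::ErrorSpec, for illustration only.
struct ErrorSpec {
  float abs;          // absolute error bound
  float rel;          // relative error bound
  bool relaxed_nans;  // if true, any result is accepted where a NaN was expected
};

// Returns true if `actual` is acceptably close to `expected` under `spec`.
// A value fails only if it exceeds BOTH bounds, or if the NaN-ness mismatches.
bool WithinErrorSpec(float expected, float actual, const ErrorSpec& spec) {
  const bool nan_mismatch =
      spec.relaxed_nans ? (!std::isnan(expected) && std::isnan(actual))
                        : (std::isnan(expected) != std::isnan(actual));
  if (nan_mismatch) return false;
  if (std::isnan(expected) && std::isnan(actual)) return true;
  const float abs_error = std::fabs(actual - expected);
  const float rel_error = abs_error / std::fabs(expected);
  return !(abs_error > spec.abs && rel_error > spec.rel);
}

int main() {
  const ErrorSpec spec{/*abs=*/1e-4f, /*rel=*/1e-3f, /*relaxed_nans=*/false};
  std::printf("%d\n", WithinErrorSpec(1000.0f, 1000.5f, spec));  // 1: rel bound ok
  std::printf("%d\n", WithinErrorSpec(1e-6f, 2e-6f, spec));      // 1: abs bound ok
  std::printf("%d\n", WithinErrorSpec(1.0f, 1.1f, spec));        // 0: both exceeded
}
```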
ExecutableRunOptions& set_intra_op_thread_pool( @@ -93,7 +87,6 @@ class ExecutableRunOptions { int device_ordinal_ = -1; DeviceAssignment* device_assignment_ = nullptr; stream_executor::Stream* stream_ = nullptr; - tensorflow::thread::ThreadPool* inter_op_thread_pool_ = nullptr; const Eigen::ThreadPoolDevice* intra_op_thread_pool_ = nullptr; ExecutionProfile* execution_profile_ = nullptr; int rng_seed_ = 0; diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc index fdc4bbdd8b162b..e8f29b83291a7c 100644 --- a/tensorflow/compiler/xla/layout_util.cc +++ b/tensorflow/compiler/xla/layout_util.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -64,6 +65,16 @@ void SetDefaultLayoutToContainer( return layout; } +/* static */ Layout LayoutUtil::MakeLayoutFromMajorToMinor( + tensorflow::gtl::ArraySlice major_to_minor) { + Layout layout; + layout.set_format(DENSE); + for (int i = major_to_minor.size() - 1; i >= 0; i--) { + layout.add_minor_to_major(major_to_minor[i]); + } + return layout; +} + /* static */ Layout LayoutUtil::MakeSparseLayout(int64 max_sparse_elements) { Layout layout; layout.set_format(SPARSE); @@ -87,8 +98,13 @@ Layout CreateDefaultLayoutForRank(int64 rank) { } // namespace /* static */ Layout LayoutUtil::GetDefaultLayoutForShape(const Shape& shape) { + if (ShapeUtil::IsOpaque(shape) || ShapeUtil::IsToken(shape)) { + // Opaque and token types have empty layouts. + return Layout(); + } + // A Layout proto corresponds to a single array, not a tuple. - DCHECK(!ShapeUtil::IsTuple(shape)); + CHECK(ShapeUtil::IsArray(shape)); return CreateDefaultLayoutForRank(shape.dimensions_size()); } @@ -115,14 +131,15 @@ Layout CreateDefaultLayoutForRank(int64 rank) { SetToDefaultLayout(&element_shape); } shape->clear_layout(); - } else if (ShapeUtil::IsOpaque(*shape)) { - shape->clear_layout(); - } else { + } else if (ShapeUtil::IsArray(*shape)) { shape->mutable_layout()->set_format(DENSE); tensorflow::protobuf::RepeatedField* minor_to_major = shape->mutable_layout()->mutable_minor_to_major(); minor_to_major->Resize(shape->dimensions_size(), 0); SetDefaultLayoutToContainer(minor_to_major); + } else { + // Opaque, token types etc. have no layout. + shape->clear_layout(); } } @@ -139,8 +156,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { LayoutUtil::SetToDefaultLayout(program_shape->mutable_result()); } -/* static */ tensorflow::Status LayoutUtil::ValidateLayoutInShape( - const Shape& shape) { +/* static */ Status LayoutUtil::ValidateLayoutInShape(const Shape& shape) { if (ShapeUtil::IsTuple(shape)) { // Tuple shape. if (shape.has_layout()) { @@ -149,30 +165,34 @@ Layout CreateDefaultLayoutForRank(int64 rank) { for (auto& element_shape : shape.tuple_shapes()) { TF_RETURN_IF_ERROR(ValidateLayoutInShape(element_shape)); } - return tensorflow::Status::OK(); - } else if (ShapeUtil::IsOpaque(shape)) { - if (shape.has_layout()) { - return InvalidArgument("opaque should not have a layout field"); - } - return tensorflow::Status::OK(); - } else { - // Array shape. 
+ return Status::OK(); + } else if (ShapeUtil::IsArray(shape)) { if (!shape.has_layout()) { return InvalidArgument("shape %s does not have a layout", ShapeUtil::HumanString(shape).c_str()); } return ValidateLayoutForShape(shape.layout(), shape); + } else { + // Token, opaque, etc. shape. + if (shape.has_layout()) { + return InvalidArgument( + "shape of primitive type %s should not have a layout", + PrimitiveType_Name(shape.element_type()).c_str()); + } + return Status::OK(); } } -/* static */ tensorflow::Status LayoutUtil::ValidateLayoutForShape( - const Layout& layout, const Shape& shape) { +/* static */ Status LayoutUtil::ValidateLayoutForShape(const Layout& layout, + const Shape& shape) { if (ShapeUtil::IsTuple(shape)) { return InvalidArgument("a single Layout is not valid for tuple shapes"); } - if (ShapeUtil::IsOpaque(shape)) { - return tensorflow::Status::OK(); + if (!ShapeUtil::IsArray(shape)) { + return InvalidArgument( + "shape of primitive type %s should not have a layout", + PrimitiveType_Name(shape.element_type()).c_str()); } if (layout.format() == INVALID_FORMAT) { @@ -224,7 +244,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { } } - return tensorflow::Status::OK(); + return Status::OK(); } /* static */ void LayoutUtil::ClearLayout(Shape* shape) { @@ -263,7 +283,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { } /* static */ bool LayoutUtil::IsPadded(const Shape& shape) { - if (ShapeUtil::IsTuple(shape) || !HasLayout(shape) || + if (!ShapeUtil::IsArray(shape) || !HasLayout(shape) || shape.layout().padded_dimensions_size() == 0) { return false; } @@ -313,7 +333,8 @@ Layout CreateDefaultLayoutForRank(int64 rank) { // Tuple shape: all subshapes must have a layout. return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(), [](const Shape& s) { return HasLayout(s); }); - } else if (ShapeUtil::IsOpaque(shape)) { + } else if (!ShapeUtil::IsArray(shape)) { + // Opaque, token types etc. ignore layout. return true; } return shape.has_layout() && shape.layout().format() != INVALID_FORMAT; @@ -383,7 +404,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) { namespace { // Internal helper for recursively copying layouts. 
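The array/opaque/token split above turns a silent pass into an explicit error when a non-array shape carries a layout. A hypothetical test in the style of `layout_util_test.cc` (which appears later in this diff), not part of the patch:

```c++
// Hypothetical sketch: non-array shapes must not carry layouts.
TEST_F(LayoutUtilTest, ValidateLayoutTokenVsArray) {
  // A dense array shape with a well-formed layout validates fine.
  Shape array = ShapeUtil::MakeShapeWithLayout(F32, {2, 3}, {1, 0});
  EXPECT_IS_OK(LayoutUtil::ValidateLayoutInShape(array));

  // A token shape without a layout is fine; forcing a layout onto it is now
  // rejected with InvalidArgument instead of being accepted.
  Shape token = ShapeUtil::MakeTokenShape();
  EXPECT_IS_OK(LayoutUtil::ValidateLayoutInShape(token));
  *token.mutable_layout() = LayoutUtil::MakeLayout({});
  EXPECT_FALSE(LayoutUtil::ValidateLayoutInShape(token).ok());
}
```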
-tensorflow::Status CopyLayoutInternal(const Shape& src, Shape* dst) { +Status CopyLayoutInternal(const Shape& src, Shape* dst) { if (ShapeUtil::IsTuple(src) != ShapeUtil::IsTuple(*dst)) { return InvalidArgument( "cannot copy layout from shape: shape structure differs"); @@ -410,25 +431,21 @@ tensorflow::Status CopyLayoutInternal(const Shape& src, Shape* dst) { dst->clear_layout(); } } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace /* static */ -tensorflow::Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, - Shape* dst) { +Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) { return CopyLayoutInternal(src, dst); } /* static */ bool LayoutUtil::LayoutsInShapesEqual(const Shape& lhs, const Shape& rhs) { - if (ShapeUtil::IsTuple(lhs) != ShapeUtil::IsTuple(rhs)) { - return false; - } if (ShapeUtil::IsTuple(lhs)) { - if (ShapeUtil::TupleElementCount(lhs) != - ShapeUtil::TupleElementCount(rhs)) { + if (!ShapeUtil::IsTuple(rhs) || ShapeUtil::TupleElementCount(lhs) != + ShapeUtil::TupleElementCount(rhs)) { return false; } for (int i = 0; i < ShapeUtil::TupleElementCount(lhs); ++i) { @@ -437,9 +454,12 @@ tensorflow::Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, } } return true; - } else { + } else if (ShapeUtil::IsArray(lhs)) { return ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs) && LayoutUtil::Equal(lhs.layout(), rhs.layout()); + } else { + // Layouts of non-array and non-tuple shapes is ignored. + return true; } } @@ -465,4 +485,25 @@ std::ostream& operator<<(std::ostream& out, const Layout& layout) { return out; } +/*static*/ size_t LayoutUtil::Hash(const Layout& layout) { + using tensorflow::hash; + using tensorflow::Hash64Combine; + + size_t hash_value = hash()(layout.format()); + + for (int64 minor_to_major : layout.minor_to_major()) { + hash_value = Hash64Combine(hash_value, hash()(minor_to_major)); + } + + for (int64 padded_dim : layout.padded_dimensions()) { + hash_value = Hash64Combine(hash_value, hash()(padded_dim)); + } + + hash_value = + Hash64Combine(hash_value, hash()(layout.padding_value())); + hash_value = Hash64Combine(hash_value, layout.max_sparse_elements()); + + return hash_value; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h index 6c54eb2201b66a..739bbe73675c7f 100644 --- a/tensorflow/compiler/xla/layout_util.h +++ b/tensorflow/compiler/xla/layout_util.h @@ -20,9 +20,9 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/types.h" @@ -36,6 +36,10 @@ class LayoutUtil { // convenience function for protobuf construction.) static Layout MakeLayout(tensorflow::gtl::ArraySlice minor_to_major); + // Similar to MakeLayout, but take indices in reverse order. + static Layout MakeLayoutFromMajorToMinor( + tensorflow::gtl::ArraySlice major_to_minor); + // Creates a sparse layout with the given maximum number of elements. (This is // a convenience function for protobuf construction.) static Layout MakeSparseLayout(int64 max_sparse_elements); @@ -61,12 +65,12 @@ class LayoutUtil { static void SetToDefaultLayout(ProgramShape* program_shape); // Validates that the layout within the given shape is correct. 
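A similar hypothetical sketch for the `MakeLayoutFromMajorToMinor` helper added in this change (not part of the patch): the helper simply reverses the given order, so a major-to-minor listing produces the same layout as the equivalent minor-to-major listing passed to `MakeLayout`.

```c++
// Hypothetical sketch: major-to-minor is just MakeLayout's order reversed.
TEST_F(LayoutUtilTest, MakeLayoutFromMajorToMinorReversesOrder) {
  EXPECT_TRUE(
      LayoutUtil::Equal(LayoutUtil::MakeLayout({0, 1, 2}),
                        LayoutUtil::MakeLayoutFromMajorToMinor({2, 1, 0})));

  // A rank-2 row-major layout written both ways.
  EXPECT_TRUE(LayoutUtil::Equal(LayoutUtil::MakeLayout({1, 0}),
                                LayoutUtil::MakeLayoutFromMajorToMinor({0, 1})));
}
```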
- static tensorflow::Status ValidateLayoutInShape(const Shape& shape); + static Status ValidateLayoutInShape(const Shape& shape); // Validates that the provided layout satisfies invariants for the given // shape. - static tensorflow::Status ValidateLayoutForShape(const Layout& layout, - const Shape& shape); + static Status ValidateLayoutForShape(const Layout& layout, + const Shape& shape); // Clears the layout in the given Shape. After this function is called, // HasLayout will return false for the shape. @@ -179,8 +183,7 @@ class LayoutUtil { // tuples. 'src' and 'dst' need not be compatible but the two shapes must // have the same tuple structure (if any) and arrays must have the same // rank. within the shapes must have the same number of dimensions. - static tensorflow::Status CopyLayoutBetweenShapes(const Shape& src, - Shape* dst); + static Status CopyLayoutBetweenShapes(const Shape& src, Shape* dst); // Returns true if the layouts of lhs and rhs are equal, false // otherwise. Recursively compares layouts of tuples. @@ -195,6 +198,9 @@ class LayoutUtil { static bool AreDimensionsConsecutive(const Layout& layout, tensorflow::gtl::ArraySlice dims); + // Compute a hash for `layout`. + static size_t Hash(const Layout& layout); + private: TF_DISALLOW_COPY_AND_ASSIGN(LayoutUtil); }; diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc index 4fd1d818e3e3b4..e4c825450dcd45 100644 --- a/tensorflow/compiler/xla/layout_util_test.cc +++ b/tensorflow/compiler/xla/layout_util_test.cc @@ -218,6 +218,47 @@ TEST_F(LayoutUtilTest, CopyLayoutBogusLayout) { "elements, but shape is rank")); } +TEST_F(LayoutUtilTest, CopyTokenLayout) { + Shape src = ShapeUtil::MakeTokenShape(); + Shape dst = ShapeUtil::MakeTokenShape(); + + // Layouts are trivially the same for token types and copying layouts should + // be a nop. + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); + EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst)); + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); +} + +TEST_F(LayoutUtilTest, CopyOpaqueLayout) { + Shape src = ShapeUtil::MakeOpaqueShape(); + Shape dst = ShapeUtil::MakeOpaqueShape(); + + // Layouts are trivially the same for opaque types and copying layouts should + // be a nop. 
+ EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); + EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst)); + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); +} + +TEST_F(LayoutUtilTest, CopyTupleLayoutWithTokenAndOpaque) { + Shape src = ShapeUtil::MakeTupleShape( + {MakeShapeWithLayout(F32, {2, 3}, {0, 1}), + MakeShapeWithLayout(F32, {42, 123}, {1, 0}), ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeOpaqueShape(), MakeShapeWithLayout(F32, {}, {}), + MakeShapeWithLayout(F32, {1, 2, 3}, {0, 2, 1})})}); + Shape dst = ShapeUtil::MakeTupleShape( + {MakeShapeWithLayout(F32, {2, 3}, {1, 0}), + MakeShapeWithLayout(F32, {42, 123}, {1, 0}), ShapeUtil::MakeTokenShape(), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeOpaqueShape(), MakeShapeWithLayout(F32, {}, {}), + MakeShapeWithLayout(F32, {1, 2, 3}, {1, 2, 0})})}); + + EXPECT_FALSE(LayoutUtil::LayoutsInShapesEqual(src, dst)); + EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst)); + EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst)); +} + TEST_F(LayoutUtilTest, ClearLayoutTuple) { Shape shape = ShapeUtil::MakeTupleShape( {MakeShapeWithLayout(F32, {2, 3}, {1, 0}), @@ -236,6 +277,16 @@ TEST_F(LayoutUtilTest, ClearLayoutTuple) { EXPECT_FALSE(shape.tuple_shapes(2).tuple_shapes(1).has_layout()); } +TEST_F(LayoutUtilTest, ClearLayoutOpaqueAndToken) { + // Opaque and token types trivially have layouts. + for (Shape shape : + {ShapeUtil::MakeOpaqueShape(), ShapeUtil::MakeTokenShape()}) { + EXPECT_TRUE(LayoutUtil::HasLayout(shape)); + LayoutUtil::ClearLayout(&shape); + EXPECT_TRUE(LayoutUtil::HasLayout(shape)); + } +} + TEST_F(LayoutUtilTest, SetToDefaultLayoutTuple) { Shape shape = ShapeUtil::MakeTupleShape( {MakeShapeWithLayout(F32, {2, 3, 4}, {1, 0, 2}), diff --git a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc index bc8405703b02dc..f42fb92359f40e 100644 --- a/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc +++ b/tensorflow/compiler/xla/legacy_flags/debug_options_flags.cc @@ -47,6 +47,12 @@ void SetDebugOptionsDefaults(DebugOptions* flags) { // Set cudnn batchnorm off by default; it does not provide a performance win // on average. flags->set_xla_gpu_use_cudnn_batchnorm(false); + + // Run all GPU work on one stream by default. Using multiple streams + // increases memory usage and we lack strong motivating benchmarks for tuning + // the heuristics needed to decide when to run on multiple streams. See + // b/77879207. + flags->set_xla_gpu_disable_multi_streaming(true); } // Allocates flag_values and flag_objects; this function must not be called more diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc new file mode 100644 index 00000000000000..bf9679cafec72c --- /dev/null +++ b/tensorflow/compiler/xla/literal_comparison.cc @@ -0,0 +1,741 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/literal_comparison.h" + +#include +#include +#include + +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/casts.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/env.h" + +using tensorflow::strings::Appendf; +using tensorflow::strings::Printf; +using tensorflow::strings::StrAppend; +using tensorflow::strings::StrCat; + +namespace xla { +namespace literal_comparison { +namespace { + +// Helper function for comparing a floating point type, FloatT, bitwise equal +// between the left-hand-side and right-hand-side, by bit-casting to UnsignedT +// -- on miscompare, a nice error message is given in the AssertionFailure. +template +Status CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) { + auto ulhs = tensorflow::bit_cast(lhs); + auto urhs = tensorflow::bit_cast(rhs); + auto lhs_double = static_cast(lhs); + auto rhs_double = static_cast(rhs); + if (ulhs != urhs) { + return InvalidArgument( + "floating values are not bitwise-equal; and equality testing " + "was requested: %s=%g=%a vs %s=%g=%a", + StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, lhs_double, + StrCat(tensorflow::strings::Hex(urhs)).c_str(), rhs_double, rhs_double); + } + return Status::OK(); +} + +// Templated comparator that specializes for float equality comparison with the +// bitwise helper above (this is the un-specialized fallback, to just use the +// default gunit implementation). +template +Status CompareEqual(NativeT lhs, NativeT rhs) { + if (lhs == rhs) { + return Status::OK(); + } + return InvalidArgument("Expected equality of these values:\n %s\n %s", + StrCat(lhs).c_str(), StrCat(rhs).c_str()); +} + +// Specializations for floating types that do bitwise comparisons when equality +// comparison is requested. +template <> +Status CompareEqual(bfloat16 lhs, bfloat16 rhs) { + return CompareFloatsBitwiseEqual(lhs, rhs); +} +template <> +Status CompareEqual(Eigen::half lhs, Eigen::half rhs) { + return CompareFloatsBitwiseEqual(lhs, rhs); +} +template <> +Status CompareEqual(float lhs, float rhs) { + return CompareFloatsBitwiseEqual(lhs, rhs); +} +template <> +Status CompareEqual(double lhs, double rhs) { + return CompareFloatsBitwiseEqual(lhs, rhs); +} +template <> +Status CompareEqual(complex64 lhs, complex64 rhs) { + auto res = CompareEqual(lhs.real(), rhs.real()); + if (!res.ok()) { + return res; + } + return CompareEqual(lhs.imag(), rhs.imag()); +} + +// A recursive function which iterates through every index of expected and +// actual literal and compares their values elementwise. Returns true if all +// elements are equal. +template +Status Equal(LiteralSlice expected, LiteralSlice actual, + tensorflow::gtl::MutableArraySlice multi_index, + int64 dimension) { + if (dimension == expected.shape().dimensions_size()) { + NativeT expected_value = expected.Get(multi_index); + NativeT actual_value = actual.Get(multi_index); + return CompareEqual(expected_value, actual_value); + } + + Status result; + for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) { + multi_index[dimension] = i; + result.Update(Equal(expected, actual, multi_index, dimension + 1)); + } + return result; +} + +// Gets the total element count. For tuples, this is not the count of tuple +// elements, but the sum of elements of each tuple element. 
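The bit-cast comparison above exists because `operator==` on floating-point values both equates numbers the tests want to distinguish (`-0.0f == 0.0f`) and distinguishes values the tests want to equate (NaN never compares equal to itself). A standalone illustration, independent of the XLA helpers:

```c++
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <limits>

// Reads a float's raw 32-bit pattern; memcpy keeps this well-defined.
static std::uint32_t BitPattern(float f) {
  std::uint32_t u;
  std::memcpy(&u, &f, sizeof(u));
  return u;
}

static bool BitwiseEqual(float a, float b) {
  return BitPattern(a) == BitPattern(b);
}

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  // operator== treats +0 and -0 as equal, but their bit patterns differ.
  std::printf("-0 == +0: %d, bitwise: %d\n", -0.0f == 0.0f,
              BitwiseEqual(-0.0f, 0.0f));
  // operator== never equates NaNs, but identical NaN payloads match bitwise.
  std::printf("nan == nan: %d, bitwise: %d\n", nan == nan,
              BitwiseEqual(nan, nan));
}
```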
+int64 RecursiveElementCount(const Shape& shape) { + if (ShapeUtil::IsTuple(shape)) { + const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); + int64 total = 0; + for (int64 i = 0; i < tuple_elements; ++i) { + total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); + } + return total; + } else { + return ShapeUtil::ElementsIn(shape); + } +} + +// Returns whether the actual and expected values are mismatched with respect to +// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec. +template +bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) { + if (relaxed_nans) { + return !std::isnan(expected) && std::isnan(actual); + } else { + return std::isnan(expected) != std::isnan(actual); + } +} + +template <> +bool NanMismatch(complex64 expected, complex64 actual, + bool relaxed_nans) { + return NanMismatch(expected.real(), actual.real(), relaxed_nans) || + NanMismatch(expected.imag(), actual.imag(), relaxed_nans); +} + +template <> +bool NanMismatch(half expected, half actual, bool relaxed_nans) { + return NanMismatch(static_cast(expected), + static_cast(actual), relaxed_nans); +} + +// Converts the given floating-point value to a string. +template +string FpValueToString(NativeT value) { + return Printf("%8.4g", static_cast(value)); +} + +template <> +string FpValueToString(complex64 value) { + return Printf("%8.4g + %8.4fi", value.real(), value.imag()); +} + +// Returns the absolute value of the given floating point value. This function +// is used instead of std::abs directly in order to allow type-dependent +// implementations for NearComparator. +template +float FpAbsoluteValue(NativeT value) { + return std::abs(value); +} + +template <> +float FpAbsoluteValue(bfloat16 value) { + return FpAbsoluteValue(static_cast(value)); +} + +template <> +float FpAbsoluteValue(half value) { + return FpAbsoluteValue(static_cast(value)); +} + +// Helper class for comparing floating-point literals within an error bound. +template +class NearComparator { + public: + // Compares the two array literals elementwise and returns a comparison + // result. The comparison is ok() if all actual and expected elements are + // within the given error bound. In case of error, the status contains a + // detailed message about the discrepancy. + static Status Compare(const LiteralSlice& expected, + const LiteralSlice& actual, ErrorSpec error, + bool detailed_message, + const MiscompareCallback& miscompare_callback) { + NearComparator comparator(expected, actual, error, + detailed_message, miscompare_callback); + return comparator.Run(); + } + + private: + // Data structure encapsulating metadata about a single element mismatch. + struct Mismatch { + NativeT actual; + NativeT expected; + float rel_error; + float abs_error; + + // The linear index of the failure within the shape. This linear index is + // from the 'actual' literal. 
+ int64 linear_index; + + bool operator<(const Mismatch& other) const { + return rel_error < other.rel_error; + } + + string ToString(const Shape& shape) const { + return Printf( + "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g", + FpValueToString(actual).c_str(), FpValueToString(expected).c_str(), + Literal::MultiIndexAsString( + IndexUtil::LinearIndexToMultidimensionalIndex(shape, + linear_index)) + .c_str(), + rel_error, abs_error); + } + }; + + NearComparator(const LiteralSlice& expected, const LiteralSlice& actual, + ErrorSpec error, bool detailed_message, + const MiscompareCallback& miscompare_callback) + : expected_(expected), + actual_(actual), + error_(error), + detailed_message_(detailed_message), + miscompare_callback_(miscompare_callback), + abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}), + abs_error_buckets_(kErrorBucketBounds.size(), 0), + rel_error_buckets_(kErrorBucketBounds.size(), 0) {} + + // Runs the comparison between expected and actual literals. + Status Run() { + VLOG(1) << "expected:"; + XLA_VLOG_LINES(1, ToStringTruncated(expected_)); + VLOG(1) << "actual:"; + XLA_VLOG_LINES(1, ToStringTruncated(actual_)); + + // If the shapes mismatch, we simply fail the expectation instead of + // printing out data, as it's a type error rather than a value error. + TF_RETURN_IF_ERROR(EqualShapes(expected_.shape(), actual_.shape())); + if (!ShapeUtil::IsArray(expected_.shape())) { + return InvalidArgument("Expected array shape; got %s.", + ShapeUtil::HumanString(expected_.shape()).c_str()); + } + + mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED)); + mismatches_.PopulateWithValue(false); + + CompareLiterals(); + + if (num_mismatches_ == 0) { + return Status::OK(); + } else if (!VLOG_IS_ON(1) && miscompare_callback_ != nullptr) { + miscompare_callback_(expected_, actual_, mismatches_); + } + return InvalidArgument("%s", ErrorMessage().c_str()); + } + + // Insert the given absolute value into the absolute value bucket vector. The + // bounds of the buckets are given by kAbsValueBucketBounds. + void UpdateAbsValueBucket(NativeT value, bool is_mismatch) { + // Adjust the bucket containing the absolute values of the 'actual' + // elements. + const float abs_value = FpAbsoluteValue(value); + for (int i = 0; i < abs_value_buckets_.size(); ++i) { + if (i == abs_value_buckets_.size() - 1 || + (abs_value >= kAbsValueBucketBounds[i] && + abs_value < kAbsValueBucketBounds[i + 1])) { + // The first value of the pair is the count of elements in the bucket, + // the second is the count of mismatches in the bucket. + abs_value_buckets_[i].first++; + if (is_mismatch) { + abs_value_buckets_[i].second++; + } + return; + } + } + } + + // Insert the given error into the given error bucket vector. + void UpdateErrorBucket( + float error, tensorflow::gtl::MutableArraySlice error_buckets) { + CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size()); + for (int i = 0; i < error_buckets.size(); ++i) { + if (error >= kErrorBucketBounds[i]) { + error_buckets[i]++; + } + } + } + + // Compares the two given elements from the expected and actual literals at + // the given literal_index and keeps track of various mismatch statistics. 
+ void CompareValues(NativeT expected, NativeT actual, int64 linear_index) { + const bool is_nan_mismatch = + NanMismatch(expected, actual, error_.relaxed_nans); + float abs_error; + float rel_error; + if (actual == expected) { + abs_error = 0; + rel_error = 0; + } else if (is_nan_mismatch) { + num_nan_mismatches_++; + // A nan mismatch is considered to have infinite error. rel_error is used + // for sorting a std::set of the top mismatchs, and a nan value here will + // result in undefined behavior because nan's do not satisfy the strict + // weak ordering requirement of std containers. + abs_error = std::numeric_limits::infinity(); + rel_error = std::numeric_limits::infinity(); + } else { + abs_error = FpAbsoluteValue(actual - expected); + rel_error = abs_error / FpAbsoluteValue(expected); + } + const bool is_abs_mismatch = abs_error > error_.abs; + const bool is_rel_mismatch = rel_error > error_.rel; + const bool is_mismatch = + is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch); + + // Update the error of the relative bucket only if the *absolute* error + // bound is exceeded and vice versa. + if (is_abs_mismatch) { + num_abs_mismatches_++; + UpdateErrorBucket(rel_error, &rel_error_buckets_); + } + if (is_rel_mismatch) { + num_rel_mismatches_++; + UpdateErrorBucket(abs_error, &abs_error_buckets_); + } + + UpdateAbsValueBucket(actual, is_mismatch); + + if (!is_mismatch) { + return; + } + + num_mismatches_++; + + // Keep track of the kTopRelativeErrorCount relative error mismatches. + if (top_rel_mismatches_.size() < kTopRelativeErrorCount || + rel_error > top_rel_mismatches_.begin()->rel_error) { + Mismatch mismatch = {actual, expected, rel_error, abs_error, + linear_index}; + top_rel_mismatches_.insert(mismatch); + if (top_rel_mismatches_.size() > kTopRelativeErrorCount) { + top_rel_mismatches_.erase(top_rel_mismatches_.begin()); + } + } + + mismatches_.data()[linear_index] = true; + } + + // Compares the two literals elementwise. + void CompareLiterals() { + // Fast path optimization for the case were layouts match. + if (LayoutUtil::Equal(actual_.shape().layout(), + expected_.shape().layout())) { + tensorflow::gtl::ArraySlice expected_data = + expected_.data(); + tensorflow::gtl::ArraySlice actual_data = + actual_.data(); + const int64 len = expected_data.size(); + for (int64 i = 0; i < len; ++i) { + CompareValues(expected_data[i], actual_data[i], i); + } + return; + } + std::vector multi_index(ShapeUtil::Rank(actual_.shape()), 0); + CompareLiteralsSlow(0, &multi_index); + } + + // Slow path for CompareLiterals when 'actual' and 'expected' literals have + // different layouts. In this case, multidimensional indices are constructed + // and indexed for each element. + void CompareLiteralsSlow(int64 dimension, std::vector* multi_index) { + if (dimension == multi_index->size()) { + CompareValues(expected_.Get(*multi_index), + actual_.Get(*multi_index), + IndexUtil::MultidimensionalIndexToLinearIndex( + actual_.shape(), *multi_index)); + } else { + for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) { + (*multi_index)[dimension] = i; + CompareLiteralsSlow(dimension + 1, multi_index); + } + } + } + + // Returns an error message string with a detailed breakdown of the + // mismatches. Called after calling Run(). + string ErrorMessage() { + string out; + int64 element_count = ShapeUtil::ElementsIn(actual_.shape()); + + auto percent_string = [](float a, float b) { + float pct = b == 0.0 ? 
0.0 : 100.0 * a / b; + return Printf("%0.4f%%", pct); + }; + + Appendf(&out, + "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound " + "%g, rel bound %g\n", + num_mismatches_, + percent_string(num_mismatches_, element_count).c_str(), + ShapeUtil::HumanString(actual_.shape()).c_str(), + ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); + if (num_nan_mismatches_ > 0) { + StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n"); + } + Appendf(&out, "Top relative error mismatches:\n"); + for (auto it = top_rel_mismatches_.rbegin(); + it != top_rel_mismatches_.rend(); ++it) { + StrAppend(&out, " ", it->ToString(actual_.shape()).c_str(), "\n"); + } + + if (!detailed_message_) { + return out; + } + + StrAppend(&out, "Absolute magnitude breakdown of actual values:\n"); + CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size()); + for (int i = 0; i < abs_value_buckets_.size(); ++i) { + const int64 bucket_size = abs_value_buckets_[i].first; + const int64 bucket_mismatches = abs_value_buckets_[i].second; + string mismatch_str = bucket_mismatches > 0 + ? Printf(", mismatches %lld", bucket_mismatches) + : ""; + Appendf(&out, " %-6g <= x < %-6g : %7lld (%9s)%s\n", + kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], + bucket_size, percent_string(bucket_size, element_count).c_str(), + mismatch_str.c_str()); + } + + auto print_accum_buckets = [&](const string& header, int64 total, + tensorflow::gtl::ArraySlice buckets) { + StrAppend(&out, header, ":\n"); + Appendf(&out, " < %-6g : %7lld (%s)\n", kErrorBucketBounds[0], + total - buckets[0], + percent_string(total - buckets[0], total).c_str()); + CHECK_EQ(buckets.size(), kErrorBucketBounds.size()); + for (int i = 0; i < kErrorBucketBounds.size(); ++i) { + Appendf(&out, " >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i], + buckets[i], percent_string(buckets[i], total).c_str()); + } + }; + Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n", + error_.abs, num_abs_mismatches_, + percent_string(num_abs_mismatches_, element_count).c_str()); + print_accum_buckets( + "Relative error breakdown of elements exceeding abs error bound", + num_abs_mismatches_, rel_error_buckets_); + Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n", + error_.rel, num_rel_mismatches_, + percent_string(num_rel_mismatches_, element_count).c_str()); + print_accum_buckets( + "Absolute error breakdown of elements exceeding rel error bound", + num_rel_mismatches_, abs_error_buckets_); + return out; + } + + // 'actual' and 'expected' literals being compared. + LiteralSlice expected_; + LiteralSlice actual_; + + // The error bounds of the comparison. + ErrorSpec error_; + + // Whether to include detailed breakdown of mismatches in the error message. + bool detailed_message_; + + // Callback to invoke on miscompare. + MiscompareCallback miscompare_callback_; + + // Number of element element mismatches encountered so far. + int64 num_mismatches_ = 0; + + // Number of elements with a nan mismatch. + int64 num_nan_mismatches_ = 0; + + // Number of elements which exceed the absolute/relative error bound. + int64 num_abs_mismatches_ = 0; + int64 num_rel_mismatches_ = 0; + + // A Literal containing which elements did not match in the expected and + // actual literals. mismatches_ contains PREDs and is of the same sizes as + // the comparison literals. + Literal mismatches_; + + // The number of mismatches to report in the output, sorted by relative error + // magnitude. 
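The `top_rel_mismatches_` bookkeeping referred to above is a bounded `std::multiset`: a candidate is inserted when the set is not yet full or when it beats the current minimum, and the smallest element is then evicted. A standalone sketch of the same pattern (values stand in for relative errors):

```c++
#include <cstdio>
#include <set>

// Keeps only the `limit` largest values seen, like the comparator's list of
// worst relative-error mismatches.
class TopK {
 public:
  explicit TopK(std::size_t limit) : limit_(limit) {}

  void Insert(double value) {
    if (values_.size() < limit_ || value > *values_.begin()) {
      values_.insert(value);
      if (values_.size() > limit_) {
        values_.erase(values_.begin());  // evict the current smallest
      }
    }
  }

  const std::multiset<double>& values() const { return values_; }

 private:
  std::size_t limit_;
  std::multiset<double> values_;
};

int main() {
  TopK top(3);
  for (double v : {0.1, 7.0, 0.4, 2.5, 9.0, 0.2}) top.Insert(v);
  for (double v : top.values()) std::printf("%g ", v);  // prints: 2.5 7 9
  std::printf("\n");
}
```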
+ static constexpr int64 kTopRelativeErrorCount = 5; + + // The set of mismatches with the largest relative error. The size of this set + // is bounded by kTopRelativeErrorCount. + std::multiset top_rel_mismatches_; + + // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the + // bounds of these buckets. abs_value_buckets_ contains a pair for each + // bucket: the element count and failure count. + static constexpr std::array kAbsValueBucketBounds = { + 0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits::infinity()}; + std::vector> abs_value_buckets_; + + // Buckets for relative and absolute errors. The relative error buckets only + // contains those elements which exceed the *absolute* error bound, and vice + // versa. This makes it easy to see the effect of adjusting the relative (or + // absolute) error bound on the success of the comparison. kErrorBucketBounds + // are the lower bounds of the buckets in both vectors. The error buckets are + // a cumulative distribution so an error value may appear in more than one + // bucket. For example an error value of 0.003 may appear in the buckets + // bounded by 0.01, 0.1, and 1.0. + static constexpr std::array kErrorBucketBounds = {0.0001, 0.001, + 0.01, 0.1, 1}; + std::vector abs_error_buckets_; + std::vector rel_error_buckets_; +}; + +template +constexpr std::array NearComparator::kAbsValueBucketBounds; +template +constexpr std::array NearComparator::kErrorBucketBounds; + +// Helper function for comparing two literals for nearness. Handles tuple-shapes +// via recursion. shape_index is the ShapeIndex of expected (or actual) +// currently being compared. +Status NearHelper(const LiteralSlice& expected, const LiteralSlice& actual, + const ErrorSpec& error, bool detailed_message, + const MiscompareCallback& miscompare_callback, + const ShapeIndex& shape_index) { + TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape())); + + if (ShapeUtil::IsTuple(expected.shape())) { + Status return_status; + for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { + const auto expected_element = LiteralSlice(expected, {i}); + const auto actual_element = LiteralSlice(actual, {i}); + ShapeIndex element_index = shape_index; + element_index.push_back(i); + Status res = + NearHelper(expected_element, actual_element, error, detailed_message, + miscompare_callback, element_index); + if (!res.ok()) { + string err_message = Printf("\nArray at shape index %s%s", + element_index.ToString().c_str(), + res.error_message().c_str()); + if (return_status.ok()) { + return_status = res; + } else { + return_status = AppendStatus(return_status, res.error_message()); + } + } + } + if (!return_status.ok() && shape_index.empty()) { + // Emit a top-level error message containing the top-level shape in case + // of mismatch. 
+ int64 total_elements = RecursiveElementCount(actual.shape()); + return_status = InvalidArgument( + "\nMismatches in shape %s (%lld elements):\n%s", + ShapeUtil::HumanString(actual.shape()).c_str(), total_elements, + return_status.error_message().c_str()); + } + return return_status; + } + + if (ShapeUtil::ElementIsFloating(expected.shape()) || + ShapeUtil::ElementIsComplex(expected.shape())) { + switch (expected.shape().element_type()) { + case BF16: + return NearComparator::Compare( + expected, actual, error, detailed_message, miscompare_callback); + break; + case F16: + return NearComparator::Compare( + expected, actual, error, detailed_message, miscompare_callback); + break; + case F32: + return NearComparator::Compare( + expected, actual, error, detailed_message, miscompare_callback); + break; + case F64: + return NearComparator::Compare( + expected, actual, error, detailed_message, miscompare_callback); + break; + case C64: + return NearComparator::Compare( + expected, actual, error, detailed_message, miscompare_callback); + break; + default: + LOG(FATAL) << "Unsupported primitive type in near comparator: " + << PrimitiveType_Name(expected.shape().element_type()) + << ". Must be floating-point type."; + } + } + + // Non-floating point literal. + return literal_comparison::Equal(expected, actual); +} + +} // namespace + +Status EqualShapes(const Shape& expected, const Shape& actual) { + if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) { + return InvalidArgument("tupleness-mismatch! want: %s got %s", + ShapeUtil::HumanString(expected).c_str(), + ShapeUtil::HumanString(actual).c_str()); + } + if (ShapeUtil::IsTuple(expected)) { + if (ShapeUtil::TupleElementCount(expected) != + ShapeUtil::TupleElementCount(actual)) { + return InvalidArgument( + "want tuple element count: %lld got tuple element count: %lld", + ShapeUtil::TupleElementCount(expected), + ShapeUtil::TupleElementCount(actual)); + } + for (int i = 0; i < expected.tuple_shapes_size(); ++i) { + Status result = + EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i)); + if (!result.ok()) { + return AppendStatus(result, StrCat("mismatch in tuple index", i)); + } + } + } else { + if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) { + return InvalidArgument("want rank of %s got rank of %s", + ShapeUtil::HumanString(expected).c_str(), + ShapeUtil::HumanString(actual).c_str()); + } + if (expected.element_type() != actual.element_type()) { + return InvalidArgument( + "mismatch in primitive type %s vs %s", + PrimitiveType_Name(expected.element_type()).c_str(), + PrimitiveType_Name(actual.element_type()).c_str()); + } + if (expected.dimensions_size() != actual.dimensions_size()) { + return InvalidArgument("want dimensions_size %d got dimensions_size %d", + expected.dimensions_size(), + actual.dimensions_size()); + } + for (int i = 0; i < expected.dimensions_size(); ++i) { + if (expected.dimensions(i) != actual.dimensions(i)) { + return InvalidArgument( + "mismatch in dimension #%d expected: %s actual: %s", i, + ShapeUtil::HumanString(expected).c_str(), + ShapeUtil::HumanString(actual).c_str()); + } + } + } + return Status::OK(); +} + +Status Equal(const LiteralSlice& expected, const LiteralSlice& actual) { + VLOG(1) << "expected:"; + XLA_VLOG_LINES(1, expected.ToString()); + VLOG(1) << "actual:"; + XLA_VLOG_LINES(1, actual.ToString()); + + TF_RETURN_IF_ERROR(EqualShapes(expected.shape(), actual.shape())); + std::vector multi_index(expected.shape().dimensions_size(), 0); + Status result; + switch 
(expected.shape().element_type()) { + case PRED: + result = Equal(expected, actual, &multi_index, 0); + break; + case U8: + result = Equal(expected, actual, &multi_index, 0); + break; + case S32: + result = Equal(expected, actual, &multi_index, 0); + break; + case S64: + result = Equal(expected, actual, &multi_index, 0); + break; + case U32: + result = Equal(expected, actual, &multi_index, 0); + break; + case U64: + result = Equal(expected, actual, &multi_index, 0); + break; + case BF16: + result = Equal(expected, actual, &multi_index, 0); + break; + case F16: + result = Equal(expected, actual, &multi_index, 0); + break; + case F32: + result = Equal(expected, actual, &multi_index, 0); + break; + case F64: + result = Equal(expected, actual, &multi_index, 0); + break; + case C64: + result = Equal(expected, actual, &multi_index, 0); + break; + case TUPLE: { + for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { + result.Update( + Equal(LiteralSlice(expected, {i}), LiteralSlice(actual, {i}))); + } + break; + } + default: + LOG(FATAL) + << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: " + << PrimitiveType_Name(expected.shape().element_type()); + } + + if (result.ok()) { + return Status::OK(); + } + + return AppendStatus(result, + tensorflow::strings::Printf( + "\nat index: %s\nexpected: %s\nactual: %s", + Literal::MultiIndexAsString(multi_index).c_str(), + ToStringTruncated(expected).c_str(), + ToStringTruncated(actual).c_str())); +} + +Status Near(const LiteralSlice& expected, const LiteralSlice& actual, + const ErrorSpec& error, bool detailed_message, + const MiscompareCallback& miscompare_callback) { + return NearHelper(expected, actual, error, detailed_message, + miscompare_callback, + /*shape_index=*/{}); +} + +string ToStringTruncated(const LiteralSlice& literal) { + return RecursiveElementCount(literal.shape()) < 1000 + ? literal.ToString() + : "[TRUNCATED, Literal with more than 1000 values]"; +} + +} // namespace literal_comparison +} // namespace xla diff --git a/tensorflow/compiler/xla/literal_comparison.h b/tensorflow/compiler/xla/literal_comparison.h new file mode 100644 index 00000000000000..00a13e361932e7 --- /dev/null +++ b/tensorflow/compiler/xla/literal_comparison.h @@ -0,0 +1,72 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Library for comparing literals without taking a dependency on testing +// libraries. + +#ifndef TENSORFLOW_COMPILER_XLA_LITERAL_COMPARISON_H_ +#define TENSORFLOW_COMPILER_XLA_LITERAL_COMPARISON_H_ + +#include "tensorflow/compiler/xla/error_spec.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/core/lib/core/status.h" + +namespace xla { +namespace literal_comparison { + +// Returns ok if the given shapes have the same rank, dimension sizes, and +// primitive types. 
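A hedged usage sketch of the new comparison entry points (the `CheckClose`/`CheckEqual` wrappers are hypothetical; header paths are the ones introduced by this change, and passing a null `MiscompareCallback` simply disables the miscompare hook):

```c++
#include "tensorflow/compiler/xla/error_spec.h"
#include "tensorflow/compiler/xla/literal_comparison.h"

namespace xla {

// Returns OK if `actual` matches `expected` to within 1e-4 absolute or
// 1e-4 relative error; on failure the status carries the mismatch breakdown.
Status CheckClose(const LiteralSlice& expected, const LiteralSlice& actual) {
  return literal_comparison::Near(expected, actual,
                                  ErrorSpec(/*aabs=*/1e-4, /*arel=*/1e-4),
                                  /*detailed_message=*/true,
                                  /*miscompare_callback=*/nullptr);
}

// Exact comparison (bitwise for floating-point element types).
Status CheckEqual(const LiteralSlice& expected, const LiteralSlice& actual) {
  return literal_comparison::Equal(expected, actual);
}

}  // namespace xla
```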
+Status EqualShapes(const Shape& expected, const Shape& actual); + +// Returns ok if the expected and actual literals are (bitwise) equal for all +// elements in the literal. Also, asserts that the rank, dimensions sizes, and +// primitive type are equal. +Status Equal(const LiteralSlice& expected, const LiteralSlice& actual); + +using MiscompareCallback = + std::function; + +// Inspects whether the expected and actual literals are within the given error +// bound for all elements. Also, inspects whether the rank, dimensions sizes, +// and dimension bounds are equivalent. +// +// Tuples are matched recursively. +// +// When comparing tensors of non-floating-point type, this inspects for exact +// equality, ignoring the ErrorSpec. +// +// If the shape of the literals is neither a complex/floating-point tensor nor a +// tuple which contains a complex/floating-point tensor, Near() is equivalent to +// Equal(). We don't raise an error in this case, because we want to allow +// callers to call Near() even if they have no preconceptions about the shapes +// being compared. +// +// If detailed_message is true, then the error message in the assertion result +// will contain a more detailed breakdown of mismatches. +Status Near(const LiteralSlice& expected, const LiteralSlice& actual, + const ErrorSpec& error, bool detailed_message, + const MiscompareCallback& miscompare_callback); + +// Calling ToString on a literal with over 100 million elements takes around +// 3 minutes. The utility of printing a literal with >1000 elements is +// questionable, especially when writing the Literal proto to disk is orders +// of magnitude faster. +string ToStringTruncated(const LiteralSlice& literal); + +} // namespace literal_comparison +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_LITERAL_COMPARISON_H_ diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc index bb6dd4f9098aef..61afc311a70293 100644 --- a/tensorflow/compiler/xla/literal_util.cc +++ b/tensorflow/compiler/xla/literal_util.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/casts.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" @@ -61,8 +62,49 @@ void ConvertEndianShort(char* bytes, int64 size) { } } +// Return a literal with all arrays of type FromNativeT converted to type +// ToNativeT in the given literal. +template +std::unique_ptr ConvertType(LiteralSlice literal) { + // First construct shape of the result. + Shape result_shape(literal.shape()); + ShapeUtil::ForEachMutableSubshape( + &result_shape, [](Shape* subshape, const ShapeIndex&) { + if (subshape->element_type() == + primitive_util::NativeToPrimitiveType()) { + subshape->set_element_type( + primitive_util::NativeToPrimitiveType()); + } + }); + auto result = MakeUnique(result_shape); + + // Then copy over the data from 'literal' converting FromNativeT values to + // ToNativeT values as necessary. 
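The new literal_comparison.h above exposes test-framework-independent entry points for exact and approximate comparison. Below is a minimal usage sketch; the helper name, the CreateR2 factory from literal_util.h, and the ErrorSpec constructor arguments are assumptions for illustration, not part of this change.

```c++
#include "tensorflow/compiler/xla/error_spec.h"
#include "tensorflow/compiler/xla/literal_comparison.h"
#include "tensorflow/compiler/xla/literal_util.h"

namespace xla {

// Hypothetical helper: exact comparison first, tolerance-based fallback.
Status CheckLiteralsClose() {
  auto expected = Literal::CreateR2<float>({{1.0f, 2.0f}, {3.0f, 4.0f}});
  auto actual = Literal::CreateR2<float>({{1.0f, 2.0f}, {3.0f, 4.00001f}});

  // Bitwise comparison: fails here because one element differs slightly.
  if (literal_comparison::Equal(*expected, *actual).ok()) {
    return Status::OK();
  }

  // Tolerance-based comparison: succeeds if every element is within the bound.
  ErrorSpec error(/*aabs=*/1e-4, /*arel=*/1e-4);  // assumed (abs, rel) ctor
  return literal_comparison::Near(*expected, *actual, error,
                                  /*detailed_message=*/false,
                                  /*miscompare_callback=*/nullptr);
}

}  // namespace xla
```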
+ ShapeUtil::ForEachSubshape( + literal.shape(), + [&](const Shape& subshape, const ShapeIndex& shape_index) { + if (ShapeUtil::IsArray(subshape)) { + if (subshape.element_type() == + primitive_util::NativeToPrimitiveType()) { + auto src = literal.data(shape_index); + auto dest = result->data(shape_index); + for (int64 i = 0; i < src.size(); ++i) { + dest[i] = static_cast(src[i]); + } + } else { + TF_CHECK_OK(result->CopyFrom(literal, + /*dest_shape_index=*/shape_index, + /*src_shape_index=*/shape_index)); + } + } + }); + return result; +} + } // namespace +LiteralBase::~LiteralBase() {} + std::ostream& operator<<(std::ostream& out, const Literal& literal) { out << literal.ToString(); return out; @@ -94,99 +136,89 @@ Literal::StrideConfig::StrideConfig( Literal::Literal(const Shape& shape) : Literal(shape, /*allocate_arrays=*/true) {} -Literal::Literal(const Shape& shape, bool allocate_arrays) - : shape_(shape), pieces_(shape), owns_buffers_(true) { - CHECK(LayoutUtil::HasLayout(shape)); - for (auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - Piece& piece = pair.second; - - piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index)); - const Shape& subshape = piece.subshape(); - if (ShapeUtil::IsArray(subshape)) { - if (allocate_arrays) { - if (LayoutUtil::IsSparseArray(subshape)) { - // For sparse arrays, the buffer must be of the size of the maximum - // number of sparse elements possible. - const int64 max_sparse_elements = - LayoutUtil::MaxSparseElements(subshape.layout()); - piece.set_buffer( - new char[max_sparse_elements * ShapeUtil::ByteSizeOfPrimitiveType( - subshape.element_type())]); - piece.set_sparse_indices(new SparseIndexArray( - max_sparse_elements, ShapeUtil::Rank(subshape))); - } else { - piece.set_buffer(new char[piece.size_bytes()]); - } +void Literal::SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays) { + if (ShapeUtil::IsTuple(shape)) { + for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const Shape& subshape = shape.tuple_shapes(i); + + auto child_piece = Piece(); + child_piece.set_subshape(&subshape); + + SetPiece(subshape, &child_piece, allocate_arrays); + + piece->emplace_back(std::move(child_piece)); + } + } else { + CHECK(ShapeUtil::IsArray(shape)); + if (allocate_arrays) { + if (LayoutUtil::IsSparseArray(shape)) { + // For sparse arrays, the buffer must be of the size of the maximum + // number of sparse elements possible. 
+ const int64 max_sparse_elements = + LayoutUtil::MaxSparseElements(shape.layout()); + piece->set_buffer( + new char[max_sparse_elements * + ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type())]); + piece->set_sparse_indices( + new SparseIndexArray(max_sparse_elements, ShapeUtil::Rank(shape))); } else { - piece.set_buffer(nullptr); + piece->set_buffer(new char[piece->size_bytes()]); } } } } -Literal::~Literal() { DeallocateBuffers(); } +Literal::Literal(const Shape& shape, bool allocate_arrays) + : LiteralBase(), shape_(MakeUnique(shape)) { + CHECK(LayoutUtil::HasLayout(*shape_)); + root_piece_ = new Piece(); + root_piece_->set_subshape(shape_.get()); + CHECK(&root_piece_->subshape() == shape_.get()); -void Literal::DeallocateBuffers() { - if (owns_buffers_) { - for (auto& pair : pieces_) { - Piece& piece = pair.second; - if (piece.buffer() != nullptr) { - delete[] piece.buffer(); - delete piece.sparse_indices(); - } - } - } + SetPiece(*shape_, root_piece_, allocate_arrays); } -Literal::Literal(Literal&& other) { - shape_ = std::move(other.shape_); - pieces_ = std::move(other.pieces_); - // We need to iterate through the pieces to set the subshape pointer - // properly. It must refer to subshapes within shape_. - for (auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - Piece& piece = pair.second; - piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index)); +Literal::~Literal() { + if (root_piece_ != nullptr) { + DeallocateBuffers(); + delete root_piece_; } - owns_buffers_ = other.owns_buffers_; +} - other.shape_ = ShapeUtil::MakeNil(); - other.pieces_ = ShapeTree(other.shape_); - other.piece({}).set_subshape(&other.shape_); +void Literal::DeallocateBuffers() { + root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* piece) { + if (piece->buffer() != nullptr) { + delete[] piece->buffer(); + delete piece->sparse_indices(); + } + }); } +Literal::Literal(Literal&& other) : LiteralBase() { *this = std::move(other); } + Literal& Literal::operator=(Literal&& other) { - DeallocateBuffers(); - shape_ = std::move(other.shape_); - pieces_ = std::move(other.pieces_); - // We need to iterate through the pieces to set the subshape pointer - // properly. It must refer to subshapes within shape_. 
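SetPiece above builds a tree of Piece nodes that mirrors the (possibly nested tuple) shape, allocating a buffer only at array leaves. The following is a toy, std-only analog of that recursion, using stand-in types rather than the XLA ones:

```c++
#include <cstdint>
#include <memory>
#include <vector>

// Stand-in "shape": either a leaf with a byte size or a tuple of children.
struct ToyShape {
  int64_t leaf_bytes = 0;          // used when children is empty
  std::vector<ToyShape> children;  // non-empty => tuple
};

// Stand-in "piece": a buffer at leaves, child pieces for tuples.
struct ToyPiece {
  std::unique_ptr<char[]> buffer;
  std::vector<ToyPiece> children;
};

ToyPiece BuildPieceTree(const ToyShape& shape, bool allocate_arrays) {
  ToyPiece piece;
  if (!shape.children.empty()) {
    // Tuple: recurse into each element, mirroring the shape tree.
    for (const ToyShape& child : shape.children) {
      piece.children.push_back(BuildPieceTree(child, allocate_arrays));
    }
  } else if (allocate_arrays) {
    // Array leaf: allocate (zeroed) storage for its data.
    piece.buffer.reset(new char[shape.leaf_bytes]());
  }
  return piece;
}
```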
- for (auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - Piece& piece = pair.second; - piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index)); - } - owns_buffers_ = other.owns_buffers_; - - other.shape_ = ShapeUtil::MakeNil(); - other.pieces_ = ShapeTree(other.shape_); - other.piece({}).set_subshape(&other.shape_); + DCHECK(&other.root_piece_->subshape() == other.shape_.get()); + using std::swap; + swap(shape_, other.shape_); + swap(root_piece_, other.root_piece_); + DCHECK(&root_piece_->subshape() == shape_.get()); + return *this; } -std::unique_ptr Literal::CreateFromShape(const Shape& shape) { +std::unique_ptr LiteralBase::CreateFromShape(const Shape& shape) { auto literal = MakeUnique(shape); - for (auto& pair : literal->pieces_) { - Piece& piece = pair.second; - if (ShapeUtil::IsArray(piece.subshape())) { - memset(piece.untyped_data(), 0, piece.size_bytes()); - } - } + literal->root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* piece) { + if (ShapeUtil::IsArray(piece->subshape())) { + memset(piece->untyped_data(), 0, piece->size_bytes()); + } + }); return literal; } -const SparseIndexArray* Literal::sparse_indices( +const SparseIndexArray* LiteralBase::sparse_indices( const ShapeIndex& shape_index) const { return piece(shape_index).sparse_indices(); } @@ -201,9 +233,19 @@ SparseIndexArray* Literal::sparse_indices(const ShapeIndex& shape_index) { return CreateFromShape(ShapeUtil::MakeShape(primitive_type, dimensions)); } +/* static */ std::unique_ptr Literal::ConvertBF16ToF32( + const LiteralSlice& bf16_literal) { + return ConvertType(bf16_literal); +} + +/* static */ std::unique_ptr Literal::ConvertF32ToBF16( + const LiteralSlice& f32_literal) { + return ConvertType(f32_literal); +} + template Status Literal::CopySliceFromInternal( - const Literal& src_literal, tensorflow::gtl::ArraySlice src_base, + const LiteralBase& src_literal, tensorflow::gtl::ArraySlice src_base, tensorflow::gtl::ArraySlice dest_base, tensorflow::gtl::ArraySlice copy_size) { TF_RET_CHECK(ShapeUtil::Rank(src_literal.shape()) == src_base.size()); @@ -263,7 +305,7 @@ Status Literal::CopySliceFromInternal( return Status::OK(); } -Status Literal::CopyElementFrom(const Literal& src_literal, +Status Literal::CopyElementFrom(const LiteralSlice& src_literal, tensorflow::gtl::ArraySlice src_index, tensorflow::gtl::ArraySlice dest_index) { DCHECK_EQ(shape().element_type(), src_literal.shape().element_type()); @@ -292,22 +334,21 @@ std::vector Literal::DecomposeTuple() { elements.push_back(Literal(ShapeUtil::GetSubshape(shape(), {i}), /*allocate_arrays=*/false)); Literal& element = elements.back(); - for (auto& pair : element.pieces_) { - const ShapeIndex& index = pair.first; - Piece& dest_piece = pair.second; - ShapeIndex src_index = {i}; - for (int64 j : index) { - src_index.push_back(j); - } - Piece& src_piece = piece(src_index); - - // Move the respective buffer and sparse indices over to the element - // Literal. - dest_piece.set_buffer(src_piece.buffer()); - src_piece.set_buffer(nullptr); - dest_piece.set_sparse_indices(src_piece.sparse_indices()); - src_piece.set_sparse_indices(nullptr); - } + element.root_piece_->ForEachMutableSubpiece( + [&](const ShapeIndex& index, Piece* dest_piece) { + ShapeIndex src_index = {i}; + for (int64 j : index) { + src_index.push_back(j); + } + Piece& src_piece = piece(src_index); + + // Move the respective buffer and sparse indices over to the element + // Literal. 
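The hunk above also adds the ConvertBF16ToF32 / ConvertF32ToBF16 wrappers around ConvertType. A small round-trip sketch follows; the function name is hypothetical and the values are chosen to be exactly representable in bfloat16:

```c++
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

void BF16RoundTrip() {
  // 1.0 and 2.5 are exact in bfloat16, so converting F32 -> BF16 -> F32
  // reproduces the original values bit-for-bit.
  auto as_f32 = Literal::CreateR1<float>({1.0f, 2.5f});
  auto as_bf16 = Literal::ConvertF32ToBF16(*as_f32);
  auto back = Literal::ConvertBF16ToF32(*as_bf16);
  CHECK(*back == *as_f32);
}

}  // namespace xla
```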
+ dest_piece->set_buffer(src_piece.buffer()); + src_piece.set_buffer(nullptr); + dest_piece->set_sparse_indices(src_piece.sparse_indices()); + src_piece.set_sparse_indices(nullptr); + }); } // Set this literal to be nil-shaped. *this = Literal(); @@ -350,7 +391,9 @@ void CopyElementsBetween(tensorflow::gtl::MutableArraySlice dest, } // namespace -Status Literal::Piece::CopyFrom(const Literal::Piece& src) { +Status LiteralBase::Piece::CopyFrom(const LiteralBase::Piece& src) { + CHECK(subshape_ != nullptr); + CHECK(src.subshape_ != nullptr); if (ShapeUtil::Equal(subshape(), src.subshape())) { // If the layouts are equal it's faster just to memcpy. memcpy(buffer(), src.buffer(), src.size_bytes()); @@ -387,7 +430,7 @@ Status Literal::Piece::CopyFrom(const Literal::Piece& src) { return Status::OK(); } -Status Literal::CopyFrom(const Literal& src_literal, +Status Literal::CopyFrom(const LiteralSlice& src_literal, const ShapeIndex& dest_shape_index, const ShapeIndex& src_shape_index) { const Shape& dest_subshape = @@ -400,36 +443,32 @@ Status Literal::CopyFrom(const Literal& src_literal, ShapeUtil::HumanString(dest_subshape).c_str(), ShapeUtil::HumanString(src_subshape).c_str()); } + return root_piece_->ForEachMutableSubpieceWithStatus( + [&](const ShapeIndex& index, Piece* piece) { + if (!ShapeUtil::IsArray(piece->subshape())) { + return Status::OK(); + } - for (auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - Piece& piece = pair.second; - if (!ShapeUtil::IsArray(piece.subshape())) { - continue; - } - - // Determine if this index is in the part of this literal that we want to - // copy over from src_literal. - bool in_subtree_to_copy = true; - for (int i = 0; i < dest_shape_index.size(); ++i) { - if (index[i] != dest_shape_index[i]) { - in_subtree_to_copy = false; - break; - } - } - if (!in_subtree_to_copy) { - continue; - } - - // Construct the index of the corresponding piece in the source literal. - ShapeIndex src_piece_index = src_shape_index; - for (int64 i = dest_shape_index.size(); i < index.size(); ++i) { - src_piece_index.push_back(index[i]); - } - - TF_RETURN_IF_ERROR(piece.CopyFrom(src_literal.piece(src_piece_index))); - } - return Status::OK(); + // Determine if this index is in the part of this literal that we want + // to copy over from src_literal. + bool in_subtree_to_copy = true; + for (int i = 0; i < dest_shape_index.size(); ++i) { + if (index[i] != dest_shape_index[i]) { + in_subtree_to_copy = false; + break; + } + } + if (!in_subtree_to_copy) { + return Status::OK(); + } + // Construct the index of the corresponding piece in the source literal. 
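CopyFrom above only touches pieces whose ShapeIndex starts with dest_shape_index, and remaps the remainder of the index onto src_shape_index. Here is a stand-alone sketch of just that prefix check and remapping, using a plain vector instead of xla::ShapeIndex:

```c++
#include <cstdint>
#include <vector>

using ToyShapeIndex = std::vector<int64_t>;  // stand-in for xla::ShapeIndex

// Returns true and fills *src_index when `index` lies under `dest_prefix`,
// mirroring the subtree selection done inside Literal::CopyFrom.
bool MapDestIndexToSrc(const ToyShapeIndex& index,
                       const ToyShapeIndex& dest_prefix,
                       const ToyShapeIndex& src_prefix,
                       ToyShapeIndex* src_index) {
  if (index.size() < dest_prefix.size()) return false;
  for (size_t i = 0; i < dest_prefix.size(); ++i) {
    if (index[i] != dest_prefix[i]) return false;  // outside the copied subtree
  }
  *src_index = src_prefix;
  for (size_t i = dest_prefix.size(); i < index.size(); ++i) {
    src_index->push_back(index[i]);  // append the suffix below the prefix
  }
  return true;
}
```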
+ ShapeIndex src_piece_index = src_shape_index; + for (int64 i = dest_shape_index.size(); i < index.size(); ++i) { + src_piece_index.push_back(index[i]); + } + TF_RETURN_IF_ERROR(piece->CopyFrom(src_literal.piece(src_piece_index))); + return Status::OK(); + }); } Status Literal::MoveFrom(Literal&& src_literal, @@ -443,37 +482,32 @@ Status Literal::MoveFrom(Literal&& src_literal, ShapeUtil::HumanString(src_literal.shape()).c_str()); } - if (!(owns_buffers_ && src_literal.owns_buffers_)) { - return InvalidArgument( - "Source and destination literals must both own their buffers (ie, not " - "be views)"); - } + src_literal.root_piece_->ForEachSubpiece( + [&](const ShapeIndex& src_index, const Piece& src_piece) { + if (!ShapeUtil::IsArray(src_piece.subshape())) { + return; + } - for (auto& pair : src_literal.pieces_) { - const ShapeIndex& src_index = pair.first; - Piece& src_piece = pair.second; - if (!ShapeUtil::IsArray(src_piece.subshape())) { - continue; - } + ShapeIndex dest_index = dest_shape_index; + for (int64 i : src_index) { + dest_index.push_back(i); + } + Piece& dest_piece = piece(dest_index); + delete[] dest_piece.buffer(); + dest_piece.set_buffer(src_piece.buffer()); + delete dest_piece.sparse_indices(); + dest_piece.set_sparse_indices(src_piece.sparse_indices()); + }); - ShapeIndex dest_index = dest_shape_index; - for (int64 i : src_index) { - dest_index.push_back(i); - } - Piece& dest_piece = piece(dest_index); - delete[] dest_piece.buffer(); - dest_piece.set_buffer(src_piece.buffer()); - delete dest_piece.sparse_indices(); - dest_piece.set_sparse_indices(src_piece.sparse_indices()); - } + src_literal.shape_ = MakeUnique(ShapeUtil::MakeNil()); + delete src_literal.root_piece_; + src_literal.root_piece_ = new LiteralBase::Piece(); + src_literal.root_piece_->set_subshape(src_literal.shape_.get()); - src_literal.shape_ = ShapeUtil::MakeNil(); - src_literal.pieces_ = ShapeTree(src_literal.shape_); - src_literal.piece({}).set_subshape(&src_literal.shape_); return Status::OK(); } -Status Literal::CopySliceFrom(const Literal& src_literal, +Status Literal::CopySliceFrom(const LiteralSlice& src_literal, tensorflow::gtl::ArraySlice src_base, tensorflow::gtl::ArraySlice dest_base, tensorflow::gtl::ArraySlice copy_size) { @@ -742,7 +776,7 @@ void Literal::PopulateR1(const tensorflow::core::Bitmap& values) { return CreateR2FromArray2D(*value); } -std::unique_ptr Literal::Relayout( +std::unique_ptr LiteralBase::Relayout( const Layout& new_layout, const ShapeIndex& shape_index) const { // Create new shape with 'new_layout' set at the given shape index. 
Shape new_shape = shape(); @@ -754,7 +788,7 @@ std::unique_ptr Literal::Relayout( return result; } -std::unique_ptr Literal::Relayout( +std::unique_ptr LiteralBase::Relayout( const Shape& shape_with_layout) const { CHECK(ShapeUtil::Compatible(shape_with_layout, shape())) << "Given shape_with_layout " << ShapeUtil::HumanString(shape_with_layout) @@ -773,7 +807,48 @@ std::unique_ptr Literal::Relayout( return result; } -StatusOr> Literal::Reshape( +StatusOr> LiteralBase::Broadcast( + const Shape& result_shape, + tensorflow::gtl::ArraySlice dimensions) const { + if (!ShapeUtil::IsArray(shape())) { + return InvalidArgument("Broadcast only supports arrays."); + } + + for (int64 i = 0; i < dimensions.size(); i++) { + TF_RET_CHECK(shape().dimensions(i) == + result_shape.dimensions(dimensions[i])); + } + + std::unique_ptr result = MakeUnique(result_shape); + + // scratch_source_index is temporary storage space for the computed index into + // the input literal. We put it here to avoid allocating an std::vector in + // every iteration of ShapeUtil::ForEachIndex. + std::vector scratch_source_index(shape().dimensions_size()); + + char* dest_data = static_cast(result->untyped_data()); + const char* source_data = static_cast(untyped_data()); + const int64 primitive_size = + ShapeUtil::ByteSizeOfPrimitiveType(shape().element_type()); + + ShapeUtil::ForEachIndex( + result_shape, [&](tensorflow::gtl::ArraySlice output_index) { + for (int64 i = 0; i < dimensions.size(); ++i) { + scratch_source_index[i] = output_index[dimensions[i]]; + } + int64 dest_index = IndexUtil::MultidimensionalIndexToLinearIndex( + result_shape, output_index); + int64 source_index = IndexUtil::MultidimensionalIndexToLinearIndex( + shape(), scratch_source_index); + memcpy(dest_data + primitive_size * dest_index, + source_data + primitive_size * source_index, primitive_size); + return true; + }); + + return std::move(result); +} + +StatusOr> LiteralBase::Reshape( tensorflow::gtl::ArraySlice dimensions) const { if (!ShapeUtil::IsArray(shape())) { return InvalidArgument("Reshape does not support tuples."); @@ -787,7 +862,8 @@ StatusOr> Literal::Reshape( } // Because the layout is monotonic, we can simply reuse the same sequence of // values without changing their order. - output->shape_ = ShapeUtil::MakeShape(shape().element_type(), dimensions); + *output->mutable_shape_do_not_use() = + ShapeUtil::MakeShape(shape().element_type(), dimensions); int64 elements_before = ShapeUtil::ElementsIn(shape()); int64 elements_after = ShapeUtil::ElementsIn(output->shape()); @@ -801,7 +877,79 @@ StatusOr> Literal::Reshape( return std::move(output); } -std::unique_ptr Literal::Transpose( +/* static */ std::unique_ptr Literal::ReshapeSlice( + tensorflow::gtl::ArraySlice new_dimensions, + tensorflow::gtl::ArraySlice minor_to_major, + const LiteralSlice& literal) { + int64 new_num_elements = 1; + for (int64 i = 0; i < new_dimensions.size(); ++i) { + new_num_elements *= new_dimensions[i]; + } + CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements); + CHECK_EQ(new_dimensions.size(), minor_to_major.size()); + + auto new_literal = MakeUnique( + ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions)); + + // Create a new shape with the given minor-to-major layout. This shape is used + // solely for converting linear address to multi-dimensional addresses when + // writing elements to the new literal. 
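A minimal usage sketch for the new Broadcast method added above; the wrapper function name is hypothetical. The `dimensions` argument states which result dimension each input dimension maps onto:

```c++
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

StatusOr<std::unique_ptr<Literal>> BroadcastRowExample() {
  // f32[2] -> f32[2,3]; input dim 0 maps onto result dim 0, so element (i, j)
  // of the result equals element i of the row.
  auto row = Literal::CreateR1<float>({1.0f, 2.0f});
  return row->Broadcast(ShapeUtil::MakeShape(F32, {2, 3}), /*dimensions=*/{0});
}

}  // namespace xla
```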
+ Shape shape_with_layout = new_literal->shape(); + *shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major); + + // Copy data into new literal, element-by-element. + for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) { + std::vector from_multi_index = + IndexUtil::LinearIndexToMultidimensionalIndex(literal.shape(), i); + std::vector to_multi_index = + IndexUtil::LinearIndexToMultidimensionalIndex(shape_with_layout, i); + switch (literal.shape().element_type()) { + case PRED: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case U8: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case U32: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case S32: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case U64: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case S64: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case F32: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case F64: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + case C64: + new_literal->Set(to_multi_index, + literal.Get(from_multi_index)); + break; + default: + LOG(FATAL) << "Unhandled primitive element type: " + << PrimitiveType_Name(literal.shape().element_type()); + } + } + + return new_literal; +} + +std::unique_ptr LiteralBase::Transpose( tensorflow::gtl::ArraySlice permutation) const { CHECK(ShapeUtil::IsArray(shape())) << "Tuple is not supported for transpose"; CHECK(IsPermutation(permutation, ShapeUtil::Rank(shape()))) @@ -832,15 +980,31 @@ std::unique_ptr Literal::Transpose( for (auto index : LayoutUtil::MinorToMajor(shape())) { layout->add_minor_to_major(inverse_permutation[index]); } - std::unique_ptr new_literal = CreateFromShape(permuted_shape); - DCHECK_GE(ShapeUtil::ByteSizeOf(new_literal->shape()), + auto new_literal = MakeUnique(permuted_shape); + DCHECK_EQ(ShapeUtil::ByteSizeOf(new_literal->shape()), ShapeUtil::ByteSizeOf(shape())); - std::memcpy(new_literal->root_piece().buffer(), root_piece().buffer(), - root_piece().size_bytes()); + std::memcpy(new_literal->untyped_data(), untyped_data(), size_bytes()); return new_literal; } -std::unique_ptr Literal::Slice( +template +std::unique_ptr LiteralBase::SliceInternal( + const Shape& result_shape, + tensorflow::gtl::ArraySlice start_indices) const { + auto result_literal = MakeUnique(result_shape); + DimensionVector new_indices(ShapeUtil::Rank(result_shape)); + result_literal->EachCell( + [&](tensorflow::gtl::ArraySlice indices, NativeT /*value*/) { + for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { + new_indices[i] = indices[i] + start_indices[i]; + } + NativeT value = Get(new_indices); + result_literal->Set(indices, value); + }); + return result_literal; +} + +std::unique_ptr LiteralBase::Slice( tensorflow::gtl::ArraySlice start_indices, tensorflow::gtl::ArraySlice limit_indices) const { CHECK(ShapeUtil::IsArray(shape())) << "tuple is not supported for slice"; @@ -857,71 +1021,37 @@ std::unique_ptr Literal::Slice( const auto result_shape = ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions, LayoutUtil::MinorToMajor(shape())); - - auto result_literal = MakeUnique(result_shape); - - DimensionVector new_indices(ShapeUtil::Rank(result_shape)); switch (result_shape.element_type()) { case F32: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice 
indices, float /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - float value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; + return SliceInternal(result_shape, start_indices); + case BF16: + return SliceInternal(result_shape, start_indices); case C64: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice indices, complex64 /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - complex64 value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; + return SliceInternal(result_shape, start_indices); case S32: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice indices, int32 /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - int32 value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; + return SliceInternal(result_shape, start_indices); case U32: - result_literal->EachCell( - [&](tensorflow::gtl::ArraySlice indices, uint32 /*value*/) { - for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) { - new_indices[i] = indices[i] + start_indices[i]; - } - uint32 value = Get(new_indices); - result_literal->Set(indices, value); - }); - return result_literal; + return SliceInternal(result_shape, start_indices); default: LOG(FATAL) << "not yet implemented: " << PrimitiveType_Name(result_shape.element_type()); } } -Literal Literal::Clone() const { +Literal LiteralBase::Clone() const { Literal result(shape()); TF_CHECK_OK(result.CopyFrom(*this)); return result; } -std::unique_ptr Literal::CloneToUnique() const { +std::unique_ptr LiteralBase::CloneToUnique() const { auto result = MakeUnique(shape()); TF_CHECK_OK(result->CopyFrom(*this)); return result; } -string Literal::GetAsString(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index) const { +string LiteralBase::GetAsString(tensorflow::gtl::ArraySlice multi_index, + const ShapeIndex& shape_index) const { const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index); CHECK(LayoutUtil::IsDenseArray(subshape)); switch (subshape.element_type()) { @@ -961,8 +1091,8 @@ string Literal::GetAsString(tensorflow::gtl::ArraySlice multi_index, } } -string Literal::GetSparseElementAsString(int64 sparse_element_number, - const ShapeIndex& shape_index) const { +string LiteralBase::GetSparseElementAsString( + int64 sparse_element_number, const ShapeIndex& shape_index) const { const Shape& subshape = ShapeUtil::GetSubshape(shape(), shape_index); CHECK(LayoutUtil::IsSparseArray(subshape)); switch (subshape.element_type()) { @@ -1016,7 +1146,7 @@ string Literal::GetSparseElementAsString(int64 sparse_element_number, } } -StatusOr Literal::GetIntegralAsS64( +StatusOr LiteralBase::GetIntegralAsS64( tensorflow::gtl::ArraySlice multi_index) const { CHECK(LayoutUtil::IsDenseArray(shape())); switch (shape().element_type()) { @@ -1039,6 +1169,27 @@ StatusOr Literal::GetIntegralAsS64( } } +size_t LiteralBase::Hash() const { + using tensorflow::Hash64; + using tensorflow::Hash64Combine; + + size_t hash_value = ShapeUtil::Hash(shape()); + + ShapeUtil::ForEachSubshape( + shape(), [&](const Shape& subshape, const ShapeIndex& index) { + if (ShapeUtil::IsTuple(subshape)) { + return; + } + + CHECK(LayoutUtil::IsDense(subshape.layout())); + hash_value = Hash64Combine( + hash_value, 
Hash64(static_cast(untyped_data(index)), + size_bytes(index))); + }); + + return hash_value; +} + Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice multi_index, int64 value) { CHECK(LayoutUtil::IsDenseArray(shape())); @@ -1069,7 +1220,7 @@ Status Literal::SetIntegralAsS64(tensorflow::gtl::ArraySlice multi_index, return Status::OK(); } -tensorflow::gtl::ArraySlice Literal::GetSparseIndex( +tensorflow::gtl::ArraySlice LiteralBase::GetSparseIndex( int64 sparse_element_number, const ShapeIndex& shape_index) const { const Piece& p = piece(shape_index); CHECK_GE(sparse_element_number, 0); @@ -1081,10 +1232,10 @@ void Literal::SortSparseElements(const ShapeIndex& shape_index) { piece(shape_index).SortSparseElements(); } -Literal Literal::GetFirstScalarLiteral() const { - CHECK(ShapeUtil::IsArray(shape_)); - CHECK_GT(ShapeUtil::ElementsIn(shape_), 0); - switch (shape_.element_type()) { +Literal LiteralBase::GetFirstScalarLiteral() const { + CHECK(ShapeUtil::IsArray(shape())); + CHECK_GT(ShapeUtil::ElementsIn(shape()), 0); + switch (shape().element_type()) { case PRED: return std::move(*Literal::CreateR0(GetFirstElement())); // 8 bit types. @@ -1120,11 +1271,11 @@ Literal Literal::GetFirstScalarLiteral() const { case U64: return std::move(*Literal::CreateR0(GetFirstElement())); default: - LOG(FATAL) << "Unhandled primitive type " << shape_.element_type(); + LOG(FATAL) << "Unhandled primitive type " << shape().element_type(); } } -void Literal::Piece::SortSparseElements() { +void LiteralBase::Piece::SortSparseElements() { switch (subshape().element_type()) { case PRED: SortSparseElementsInternal(); @@ -1175,7 +1326,7 @@ void Literal::Piece::SortSparseElements() { } template -void Literal::Piece::SortSparseElementsInternal() { +void LiteralBase::Piece::SortSparseElementsInternal() { CHECK(LayoutUtil::IsSparseArray(subshape())); int64 num_elements = sparse_indices()->index_count(); auto values = data(); @@ -1186,9 +1337,11 @@ void Literal::Piece::SortSparseElementsInternal() { namespace { -void ToStringHelper(const Literal& literal, const ShapeIndex& shape_index, +void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index, bool print_layout, std::vector* pieces) { const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index); + CHECK(LayoutUtil::HasLayout(literal.shape())); + CHECK(LayoutUtil::HasLayout(subshape)); auto shape_to_string = [print_layout](const Shape& shape) { if (print_layout) { @@ -1347,13 +1500,14 @@ void ToStringHelper(const Literal& literal, const ShapeIndex& shape_index, } // namespace -int64 Literal::sparse_element_count() const { +int64 LiteralBase::sparse_element_count() const { CHECK(LayoutUtil::IsSparseArray(shape())); return sparse_indices()->index_count(); } -string Literal::ToString(bool print_layout) const { +string LiteralBase::ToString(bool print_layout) const { std::vector pieces; + CHECK(LayoutUtil::HasLayout(this->shape())); ToStringHelper(*this, {}, print_layout, &pieces); return tensorflow::str_util::Join(pieces, ""); } @@ -1361,7 +1515,7 @@ string Literal::ToString(bool print_layout) const { /* static */ std::unique_ptr Literal::MakeTuple( tensorflow::gtl::ArraySlice elements) { std::vector element_shapes; - for (const Literal* element : elements) { + for (const auto* element : elements) { element_shapes.push_back(element->shape()); } auto literal = MakeUnique(ShapeUtil::MakeTupleShape(element_shapes)); @@ -1371,6 +1525,19 @@ string Literal::ToString(bool print_layout) const { return literal; } +/* static */ 
std::unique_ptr Literal::MakeTupleFromSlices( + tensorflow::gtl::ArraySlice elements) { + std::vector element_shapes; + for (const auto& element : elements) { + element_shapes.push_back(element.shape()); + } + auto literal = MakeUnique(ShapeUtil::MakeTupleShape(element_shapes)); + for (int i = 0; i < elements.size(); ++i) { + TF_CHECK_OK(literal->CopyFrom(elements[i], /*dest_shape_index=*/{i})); + } + return literal; +} + /* static */ std::unique_ptr Literal::MakeTupleOwned( std::vector> elements) { std::vector element_shapes; @@ -1386,7 +1553,7 @@ string Literal::ToString(bool print_layout) const { return literal; } -void Literal::EachCellAsString( +void LiteralBase::EachCellAsString( const std::function indices, const string& value)>& per_cell) const { if (ShapeUtil::HasZeroElements(shape())) { @@ -1402,7 +1569,7 @@ void Literal::EachCellAsString( namespace { template std::unique_ptr ConvertBetweenNativeTypesWithConverter( - const Literal& src_literal, const ConverterType& converter) { + const LiteralBase& src_literal, const ConverterType& converter) { CHECK(ShapeUtil::IsArray(src_literal.shape())); auto result_literal = MakeUnique(ShapeUtil::ChangeElementType( src_literal.shape(), @@ -1418,7 +1585,8 @@ std::unique_ptr ConvertBetweenNativeTypesWithConverter( } template -std::unique_ptr ConvertBetweenNativeTypes(const Literal& src_literal) { +std::unique_ptr ConvertBetweenNativeTypes( + const LiteralBase& src_literal) { auto converter = [](NativeSrcT src) { return static_cast(src); }; return ConvertBetweenNativeTypesWithConverter( src_literal, converter); @@ -1427,7 +1595,7 @@ std::unique_ptr ConvertBetweenNativeTypes(const Literal& src_literal) { template typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)), std::unique_ptr>::type -BitcastBetweenNativeTypes(const Literal& src_literal) { +BitcastBetweenNativeTypes(const LiteralBase& src_literal) { auto converter = [](NativeSrcT src) { return tensorflow::bit_cast(src); }; @@ -1442,12 +1610,12 @@ BitcastBetweenNativeTypes(const Literal& src_literal) { template typename std::enable_if<(sizeof(NativeSrcT) != sizeof(NativeDestT)), std::unique_ptr>::type -BitcastBetweenNativeTypes(const Literal& src_literal) { +BitcastBetweenNativeTypes(const LiteralBase& src_literal) { LOG(FATAL) << "Invalid bitcast between types of different sizes."; } template -std::unique_ptr ConvertToC64(const Literal& src_literal) { +std::unique_ptr ConvertToC64(const LiteralBase& src_literal) { CHECK(ShapeUtil::IsArray(src_literal.shape())); auto result_literal = MakeUnique( ShapeUtil::ChangeElementType(src_literal.shape(), C64)); @@ -1465,7 +1633,7 @@ std::unique_ptr ConvertToC64(const Literal& src_literal) { } template -std::unique_ptr ConvertIfTypesMatch(const Literal& src_literal, +std::unique_ptr ConvertIfTypesMatch(const LiteralBase& src_literal, bool bitcast) { CHECK_EQ(primitive_src_type, src_literal.shape().element_type()); if (bitcast) { @@ -1485,7 +1653,7 @@ std::unique_ptr ConvertIfTypesMatch(const Literal& src_literal, template StatusOr> ConvertIfDestTypeMatches( - const Literal& src_literal, PrimitiveType primitive_dest_type, + const LiteralBase& src_literal, PrimitiveType primitive_dest_type, bool bitcast) { switch (primitive_dest_type) { #define CONVERT_IF_TYPES_MATCH(type) \ @@ -1520,7 +1688,8 @@ StatusOr> ConvertIfDestTypeMatches( } StatusOr> ConvertSwitch( - const Literal& literal, PrimitiveType primitive_dest_type, bool bitcast) { + const LiteralBase& literal, PrimitiveType primitive_dest_type, + bool bitcast) { 
TF_RET_CHECK(ShapeUtil::IsArray(literal.shape())); if (literal.shape().element_type() == primitive_dest_type) { return literal.CloneToUnique(); @@ -1554,12 +1723,12 @@ StatusOr> ConvertSwitch( } // namespace -StatusOr> Literal::Convert( +StatusOr> LiteralBase::Convert( PrimitiveType primitive_dest_type) const { return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/false); } -StatusOr> Literal::BitcastConvert( +StatusOr> LiteralBase::BitcastConvert( PrimitiveType primitive_dest_type) const { if (primitive_util::BitWidth(shape().element_type()) != primitive_util::BitWidth(primitive_dest_type)) { @@ -1574,7 +1743,7 @@ StatusOr> Literal::BitcastConvert( return ConvertSwitch(*this, primitive_dest_type, /*bitcast=*/true); } -StatusOr> Literal::ConvertToShape( +StatusOr> LiteralBase::ConvertToShape( const Shape& dest_shape, bool round_f32_to_bf16) const { if (!ShapeUtil::IsTuple(dest_shape)) { if (round_f32_to_bf16 && shape().element_type() == F32 && @@ -1589,7 +1758,7 @@ StatusOr> Literal::ConvertToShape( } std::vector elements; for (int i = 0; i < ShapeUtil::TupleElementCount(shape()); ++i) { - auto element = LiteralView::Create(*this, {i}); + auto element = LiteralSlice(*this, {i}); TF_ASSIGN_OR_RETURN( auto new_element, element.ConvertToShape(ShapeUtil::GetSubshape(dest_shape, {i}))); @@ -1601,8 +1770,8 @@ StatusOr> Literal::ConvertToShape( } template -bool Literal::Piece::EqualElementsInternal( - const Literal::Piece& other, std::vector* multi_index) const { +bool LiteralBase::Piece::EqualElementsInternal( + const LiteralBase::Piece& other, std::vector* multi_index) const { if (multi_index->size() == ShapeUtil::Rank(subshape())) { return (Get(*multi_index) == other.Get(*multi_index)); } @@ -1616,7 +1785,7 @@ bool Literal::Piece::EqualElementsInternal( return true; } -bool Literal::Piece::EqualElements(const Literal::Piece& other) const { +bool LiteralBase::Piece::EqualElements(const LiteralBase::Piece& other) const { DCHECK(ShapeUtil::Compatible(subshape(), other.subshape())); std::vector multi_index; @@ -1644,28 +1813,28 @@ bool Literal::Piece::EqualElements(const Literal::Piece& other) const { case C64: return EqualElementsInternal(other, &multi_index); default: - LOG(FATAL) << "Unimplemented: Literal::Piece::EqualElements for type " + LOG(FATAL) << "Unimplemented: LiteralBase::Piece::EqualElements for type " << PrimitiveType_Name(subshape().element_type()); } } -bool Literal::operator==(const Literal& other) const { +bool LiteralBase::operator==(const LiteralBase& other) const { if (!ShapeUtil::Compatible(shape(), other.shape())) { return false; } - for (const auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - const Piece& piece = pair.second; - if (!ShapeUtil::IsArray(piece.subshape())) { - continue; - } - const Piece& other_piece = other.piece(index); - if (!piece.EqualElements(other_piece)) { - return false; - } - } - return true; + return root_piece().ForEachSubpieceWithBool( + [&](const ShapeIndex& index, const Piece& piece) { + if (!ShapeUtil::IsArray(piece.subshape())) { + return true; + } + + const Piece& other_piece = other.piece(index); + if (!piece.EqualElements(other_piece)) { + return false; + } + return true; + }); } namespace { @@ -1683,11 +1852,11 @@ static bool AllElementsEqualValue(tensorflow::gtl::ArraySlice data, } // namespace -bool Literal::IsAll(int8 value) const { - for (const auto& pair : pieces_) { - const Piece& piece = pair.second; +bool LiteralBase::IsAll(int8 value) const { + return root_piece().ForEachSubpieceWithBool([&](const 
ShapeIndex& index, + const Piece& piece) { if (!ShapeUtil::IsArray(piece.subshape())) { - continue; + return true; } auto piece_is_all = [&]() { @@ -1740,41 +1909,41 @@ bool Literal::IsAll(int8 value) const { if (!piece_is_all()) { return false; } - } - return true; + return true; + }); } -bool Literal::IsAllFloat(float value) const { - for (const auto& pair : pieces_) { - const Piece& piece = pair.second; - if (!ShapeUtil::IsArray(piece.subshape())) { - continue; - } +bool LiteralBase::IsAllFloat(float value) const { + return root_piece().ForEachSubpieceWithBool( + [&](const ShapeIndex& index, const Piece& piece) { + if (!ShapeUtil::IsArray(piece.subshape())) { + return true; + } - auto piece_is_all = [&]() { - switch (shape().element_type()) { - case F32: - return AllElementsEqualValue(piece.data(), value); - case F64: - return AllElementsEqualValue(piece.data(), value); - case F16: - return AllElementsEqualValue(piece.data(), - static_cast(value)); - case BF16: - return AllElementsEqualValue(piece.data(), - static_cast(value)); - default: + auto piece_is_all = [&]() { + switch (shape().element_type()) { + case F32: + return AllElementsEqualValue(piece.data(), value); + case F64: + return AllElementsEqualValue(piece.data(), value); + case F16: + return AllElementsEqualValue(piece.data(), + static_cast(value)); + case BF16: + return AllElementsEqualValue( + piece.data(), static_cast(value)); + default: + return false; + } + }; + if (!piece_is_all()) { return false; - } - }; - if (!piece_is_all()) { - return false; - } - } - return true; + } + return true; + }); } -bool Literal::IsAllComplex(complex64 value) const { +bool LiteralBase::IsAllComplex(complex64 value) const { switch (shape().element_type()) { case C64: return AllElementsEqualValue(root_piece().data(), @@ -1784,93 +1953,93 @@ bool Literal::IsAllComplex(complex64 value) const { } } -bool Literal::IsAllFirst() const { - for (const auto& pair : pieces_) { - const Piece& piece = pair.second; - if (!ShapeUtil::IsArray(piece.subshape())) { - continue; - } - - // Empty shapes are not all the first element since there is no first - // element. 
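A short sketch of what IsAll and IsAllFloat report on small literals; the helper name is hypothetical and the behavior follows the doc comments for these predicates:

```c++
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

void IsAllExamples() {
  auto zeros = Literal::CreateR2<int32>({{0, 0}, {0, 0}});
  CHECK(zeros->IsAll(0));  // every element equals the integer value

  auto halves = Literal::CreateR1<float>({0.5f, 0.5f});
  CHECK(halves->IsAllFloat(0.5f));  // value is cast to float, then compared
  CHECK(!halves->IsAll(0));         // 0.5 does not equal 0
}

}  // namespace xla
```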
- if (ShapeUtil::HasZeroElements(piece.subshape())) { - return false; - } - auto piece_is_all = [&]() { - switch (piece.subshape().element_type()) { - case PRED: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 8 bit types - case S8: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); +bool LiteralBase::IsAllFirst() const { + return root_piece().ForEachSubpieceWithBool( + [&](const ShapeIndex& index, const Piece& piece) { + if (!ShapeUtil::IsArray(piece.subshape())) { + return true; } - case U8: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 16 bit types - case BF16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case F16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case S16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case U16: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 32 bit types - case F32: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case U32: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case S32: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - // 64 bit types - case C64: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case F64: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case S64: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - case U64: { - auto data = piece.data(); - return AllElementsEqualValue(data, data[0]); - } - default: + + // Empty shapes are not all the first element since there is no first + // element. 
+ if (ShapeUtil::HasZeroElements(piece.subshape())) { return false; - } - }; + } + auto piece_is_all = [&]() { + switch (piece.subshape().element_type()) { + case PRED: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 8 bit types + case S8: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U8: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 16 bit types + case BF16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case F16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case S16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U16: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 32 bit types + case F32: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U32: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case S32: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + // 64 bit types + case C64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case F64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case S64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + case U64: { + auto data = piece.data(); + return AllElementsEqualValue(data, data[0]); + } + default: + return false; + } + }; - if (!piece_is_all()) { - return false; - } - } - return true; + if (!piece_is_all()) { + return false; + } + return true; + }); } -bool Literal::IsZero(tensorflow::gtl::ArraySlice indices) const { +bool LiteralBase::IsZero(tensorflow::gtl::ArraySlice indices) const { CHECK(ShapeUtil::IsArray(shape())); switch (shape().element_type()) { case U8: @@ -1912,7 +2081,7 @@ void CopyToRepeatedField(RepeatedFieldT* dest, } // namespace -void Literal::Piece::WriteToProto(LiteralProto* proto) const { +void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const { *proto->mutable_shape() = subshape(); switch (subshape().element_type()) { case PRED: @@ -1968,12 +2137,12 @@ void Literal::Piece::WriteToProto(LiteralProto* proto) const { } } -const void* Literal::Piece::untyped_data() const { +const void* LiteralBase::Piece::untyped_data() const { CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); return buffer(); } -void* Literal::Piece::untyped_data() { +void* LiteralBase::Piece::untyped_data() { CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); return buffer(); } @@ -1994,7 +2163,7 @@ Status CopyFromRepeatedField(tensorflow::gtl::MutableArraySlice dest, } // namespace -Status Literal::Piece::CopyFromProto(const LiteralProto& proto) { +Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) { // These conditions should have been checked in Literal::CreateFromProto. 
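The serialization paths shown here (Piece::WriteToProto / CopyFromProto, plus the ToProto and CreateFromProto entry points in the following hunks) support a simple round trip. A hedged sketch, with a hypothetical helper name:

```c++
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

// Serialize a literal (tuples become nested tuple_literals in the proto) and
// rebuild an equivalent Literal; CreateFromProto requires shapes with layouts.
StatusOr<std::unique_ptr<Literal>> RoundTripThroughProto(
    const Literal& literal) {
  LiteralProto proto = literal.ToProto();
  return Literal::CreateFromProto(proto);
}

}  // namespace xla
```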
TF_RET_CHECK(proto.has_shape()); TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape())); @@ -2061,21 +2230,19 @@ Status Literal::Piece::CopyFromProto(const LiteralProto& proto) { return Status::OK(); } -LiteralProto Literal::ToProto() const { +LiteralProto LiteralBase::ToProto() const { LiteralProto proto; - for (const auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - const Piece& piece = pair.second; - - LiteralProto* proto_piece = &proto; - for (int64 i : index) { - while (proto_piece->tuple_literals_size() <= i) { - proto_piece->add_tuple_literals(); - } - proto_piece = proto_piece->mutable_tuple_literals(i); - } - piece.WriteToProto(proto_piece); - } + root_piece().ForEachSubpiece( + [&](const ShapeIndex& index, const Piece& piece) { + LiteralProto* proto_piece = &proto; + for (int64 i : index) { + while (proto_piece->tuple_literals_size() <= i) { + proto_piece->add_tuple_literals(); + } + proto_piece = proto_piece->mutable_tuple_literals(i); + } + piece.WriteToProto(proto_piece); + }); if (LayoutUtil::IsSparseArray(shape())) { CopyToRepeatedField(proto.mutable_sparse_indices(), @@ -2097,33 +2264,40 @@ StatusOr> Literal::CreateFromProto( auto literal = MakeUnique(proto.shape()); - for (auto& pair : literal->pieces_) { - const ShapeIndex& index = pair.first; - Piece& piece = pair.second; - const LiteralProto* proto_element = &proto; - for (int64 i : index) { - TF_RET_CHECK(i < proto_element->tuple_literals_size()); - proto_element = &proto_element->tuple_literals(i); - } + TF_RETURN_IF_ERROR(literal->root_piece_->ForEachMutableSubpieceWithStatus( + [&](const ShapeIndex& index, Piece* piece) { + const LiteralProto* proto_element = &proto; + for (int64 i : index) { + CHECK(i < proto_element->tuple_literals_size()); + proto_element = &proto_element->tuple_literals(i); + } - if (ShapeUtil::IsTuple(piece.subshape())) { - if (proto_element->tuple_literals_size() != - ShapeUtil::TupleElementCount(piece.subshape())) { - return InvalidArgument( - "Expected %lld tuple elements in LiteralProto, has %d", - ShapeUtil::TupleElementCount(piece.subshape()), - proto_element->tuple_literals_size()); - } - continue; - } + if (ShapeUtil::IsTuple(piece->subshape())) { + if (proto_element->tuple_literals_size() != + ShapeUtil::TupleElementCount(piece->subshape())) { + return InvalidArgument( + "Expected %lld tuple elements in LiteralProto, has %d", + ShapeUtil::TupleElementCount(piece->subshape()), + proto_element->tuple_literals_size()); + } + return Status::OK(); + } + + CHECK(ShapeUtil::IsArray(piece->subshape())); + TF_RETURN_IF_ERROR(piece->CopyFromProto(*proto_element)); + + return Status::OK(); + })); - TF_RET_CHECK(ShapeUtil::IsArray(piece.subshape())); - TF_RETURN_IF_ERROR(piece.CopyFromProto(*proto_element)); - } return std::move(literal); } -const void* Literal::untyped_data(const ShapeIndex& shape_index) const { +/* static */ string Literal::MultiIndexAsString( + tensorflow::gtl::ArraySlice multi_index) { + return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}"); +} + +const void* LiteralBase::untyped_data(const ShapeIndex& shape_index) const { return piece(shape_index).untyped_data(); } @@ -2131,11 +2305,11 @@ void* Literal::untyped_data(const ShapeIndex& shape_index) { return piece(shape_index).untyped_data(); } -int64 Literal::size_bytes(const ShapeIndex& shape_index) const { +int64 LiteralBase::size_bytes(const ShapeIndex& shape_index) const { return piece(shape_index).size_bytes(); } -string Literal::GetR1U8AsString() const { +string LiteralBase::GetR1U8AsString() 
const { CHECK(ShapeUtil::IsArray(shape())); CHECK_EQ(ShapeUtil::Rank(shape()), 1); CHECK_EQ(shape().element_type(), U8); @@ -2143,51 +2317,55 @@ string Literal::GetR1U8AsString() const { ShapeUtil::ElementsIn(shape())); } -/* static */ const LiteralView LiteralView::Create( - const Literal& literal, const ShapeIndex& view_root) { - return LiteralView(literal, view_root); -} +void BorrowingLiteral::BuildPieceSubtree(const Shape& shape, Piece* piece) { + CHECK(ShapeUtil::IsTuple(shape)); + for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { + const Shape& subshape = shape.tuple_shapes(i); -LiteralView::LiteralView(const Literal& literal, const ShapeIndex& view_root) { - shape_ = ShapeUtil::GetSubshape(literal.shape(), view_root); - pieces_ = ShapeTree(shape_); - owns_buffers_ = false; - for (auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - Piece& piece = pair.second; + auto child_piece = Piece(); + child_piece.set_subshape(&subshape); - ShapeIndex src_index = view_root; - for (int64 i : index) { - src_index.push_back(i); + if (ShapeUtil::IsTuple(subshape)) { + BuildPieceSubtree(subshape, &child_piece); } - const Piece& src_piece = literal.piece(src_index); - piece.set_buffer(src_piece.buffer()); - piece.set_sparse_indices(src_piece.sparse_indices()); - piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index)); + + piece->emplace_back(std::move(child_piece)); } } -LiteralView::~LiteralView() {} +LiteralSlice::LiteralSlice(const LiteralBase& literal) + : LiteralBase(), root_piece_(&literal.root_piece()) {} -LiteralView::LiteralView(const LiteralView& other) { CopyFrom(other); } +LiteralSlice::LiteralSlice(const LiteralBase& literal, + const ShapeIndex& view_root) + : LiteralBase(), root_piece_(&literal.piece(view_root)) {} -LiteralView& LiteralView::operator=(const LiteralView& other) { - CopyFrom(other); - return *this; +BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape) + : LiteralBase(), shape_(shape) { + CHECK(ShapeUtil::IsArray(shape_)); + CHECK_NE(src_buf_ptr, nullptr); + CHECK(LayoutUtil::HasLayout(shape_)); + + root_piece_ = Piece(); + root_piece_.set_buffer(const_cast(src_buf_ptr)); + root_piece_.set_subshape(&shape_); } -void LiteralView::CopyFrom(const LiteralView& other) { - // We can't use the default copy-constructor/copy-assignment because - // Piece::subshape_ points to subshapes within the Shape of the owning - // Literal/LiteralView. 
- shape_ = other.shape(); - pieces_ = other.pieces_; - for (auto& pair : pieces_) { - const ShapeIndex& index = pair.first; - Piece& piece = pair.second; - piece.set_subshape(&ShapeUtil::GetSubshape(shape_, index)); +BorrowingLiteral::BorrowingLiteral( + tensorflow::gtl::ArraySlice src_buf_ptrs, const Shape& shape) + : LiteralBase(), shape_(shape) { + CHECK(ShapeUtil::IsTuple(shape_)); + CHECK(!ShapeUtil::IsNestedTuple(shape_)); + CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(shape_)); + root_piece_ = Piece(); + root_piece_.set_subshape(&shape_); + BuildPieceSubtree(shape_, &root_piece_); + + for (int i = 0; i < src_buf_ptrs.size(); ++i) { + const auto& src_shape = shape_.tuple_shapes(i); + CHECK(ShapeUtil::IsArray(src_shape)); + root_piece_.child(i).set_buffer(const_cast(src_buf_ptrs[i])); } - owns_buffers_ = false; } } // namespace xla diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h index 8aa19222dc4b91..1e26eb7ad4098b 100644 --- a/tensorflow/compiler/xla/literal_util.h +++ b/tensorflow/compiler/xla/literal_util.h @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/shape_tree.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/sparse_index_array.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -52,14 +51,509 @@ limitations under the License. namespace xla { +// Forward declare Literal and LiteralSlice class to be used by the creation +// methods in the base class. +class Literal; +class LiteralSlice; + +// Abstract base class for literals. +class LiteralBase { + public: + virtual ~LiteralBase() = 0; + + // Literals are equal if they have compatible shapes and the same data + // values. Layout is not compared. + bool operator==(const LiteralBase& other) const; + bool operator!=(const LiteralBase& other) const { return !(*this == other); } + + // Returns the shape of the literal. + const Shape& shape() const { return root_piece().subshape(); } + + // Serialize to proto. + LiteralProto ToProto() const; + + // Returns an ArraySlice of the array for this literal for the given NativeT + // (e.g., float). CHECKs if the subshape of the literal at the given + // ShapeIndex is not array. See primitive_util.h for the mapping from XLA type + // to native type. + template + tensorflow::gtl::ArraySlice data( + const ShapeIndex& shape_index = {}) const; + + // Returns a const pointer to the sparse index array. Returns nullptr if the + // literal is not a sparse array. + const SparseIndexArray* sparse_indices( + const ShapeIndex& shape_index = {}) const; + + // Returns a const pointer to (or size of) the underlying buffer holding the + // array at the given shape index. CHECKs if the subshape of the literal at + // the given ShapeIndex is not array. + const void* untyped_data(const ShapeIndex& shape_index = {}) const; + int64 size_bytes(const ShapeIndex& shape_index = {}) const; + + // Returns this literal's data as a string. This literal must be a rank-1 U8 + // array. + string GetR1U8AsString() const; + + // Returns a string representation of the literal value. + // Warning: this function can take minutes for multi-million element Literals. + string ToString(bool print_layout = false) const; + + // Gets an element in the literal at the given index. The multi_index is + // CHECKed against the dimension sizes. 
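BorrowingLiteral, introduced above, wraps caller-owned memory without copying it. A usage sketch, assuming that ShapeUtil::MakeShape yields a shape with a default layout and that the buffer outlives the view:

```c++
#include <vector>

#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

float SumWithoutCopying(const std::vector<float>& host_buffer) {
  // Wrap the existing host buffer as a non-owning literal view; no data is
  // copied, so host_buffer must stay alive while `view` is in use.
  Shape shape = ShapeUtil::MakeShape(
      F32, {static_cast<int64>(host_buffer.size())});
  BorrowingLiteral view(reinterpret_cast<const char*>(host_buffer.data()),
                        shape);

  float sum = 0.0f;
  for (float v : view.data<float>()) {
    sum += v;
  }
  return sum;
}

}  // namespace xla
```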
+ template + NativeT Get(tensorflow::gtl::ArraySlice multi_index, + const ShapeIndex& shape_index) const; + // Overloads of Get for array literals. CHECKs if the literal is not + // array-shaped and dense. + template + NativeT Get(tensorflow::gtl::ArraySlice multi_index) const; + + // Returns the element value at index (0, ..., 0), however many zeroes are + // required for that index. + template + NativeT GetFirstElement() const; + + // As Get(), but determines the correct type and converts the value + // into text. + string GetAsString(tensorflow::gtl::ArraySlice multi_index, + const ShapeIndex& shape_index = {}) const; + // As GetSparseElement(), but determines the correct type and converts the + // value into text. + string GetSparseElementAsString(int64 sparse_element_number, + const ShapeIndex& shape_index = {}) const; + // As Get(), but determines the correct type and converts the value into + // int64. This literal must be an array. + StatusOr GetIntegralAsS64( + tensorflow::gtl::ArraySlice multi_index) const; + + // Returns the multi-index of the element in a sparse literal at the given + // sparse element number. The sparse element number is the position with in + // the sparse array's list of (index, value) pairs, and is checked against the + // total number of (index, value) pairs in the sparse array. + tensorflow::gtl::ArraySlice GetSparseIndex( + int64 sparse_element_number, const ShapeIndex& shape_index = {}) const; + + // Returns the value of the element in a sparse literal at the given sparse + // element number. The sparse element number is the position with in the + // sparse array's list of (index, value) pairs, and is checked against the + // total number of (index, value) pairs in the sparse array. + template + NativeT GetSparseElement(int64 sparse_element_number, + const ShapeIndex& shape_index = {}) const; + + // Invokes the "per cell" callback for each element in the provided + // literal with the element's indices and a string representation of + // the element's value. + // + // This function is useful if you want a polymorphic representation + // of the tensor's elements (turning it to a string for something + // like representation in a protobuf). + // + // This literal must have a dense layout. + void EachCellAsString( + const std::function indices, + const string& value)>& per_cell) const; + template + void EachCell(std::function indices, + NativeT value)> + per_cell) const; + + // Returns whether every element in this literal is equal to value. + // + // value is an int8 because we expect this to be called with small + // compile-time constants (0, -1, etc.) and so that whatever value you pass + // can be represented exactly by floating-point types as small as 16 bits. + // + // If value doesn't fit in this literal's type, returns false. Values of 1/0 + // are considered equal to true/false; other values are not considered equal + // to true. Also if this literal is not array-shaped false is returned. + bool IsAll(int8 value) const; + + // Like IsAll(const Literal&, int8), except we check whether the literal is + // equal to a particular floating-point number. + // + // If the literal is not a floating-point value, this always returns false. + // + // This casts value to the type of literal, then compares using ==. The usual + // admonishments about floating-point equality checks apply. We expect you to + // use this to check for values that can be expressed precisely as a float, + // e.g. -0.5. Also if this literal is not array-shaped false is returned. 
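A small sketch of the element-access API documented above (Get, GetFirstElement, EachCell); the function name and values are illustrative only:

```c++
#include "tensorflow/compiler/xla/literal_util.h"

namespace xla {

float ReadElements() {
  auto m = Literal::CreateR2<float>({{1.0f, 2.0f}, {3.0f, 4.0f}});

  // Element access takes a multi-index checked against the dimension sizes.
  float corner = m->Get<float>({1, 0});       // 3.0f
  float first = m->GetFirstElement<float>();  // element at (0, 0): 1.0f

  // Visit every cell together with its multi-index and value.
  float sum = 0.0f;
  m->EachCell<float>(
      [&sum](tensorflow::gtl::ArraySlice<int64> indices, float value) {
        sum += value;
      });
  return corner + first + sum;  // 3 + 1 + 10
}

}  // namespace xla
```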
+ bool IsAllFloat(float value) const; + + // Like IsAll(const Literal&, int8), except we check whether the literal is + // equal to a particular complex number. + // + // If the literal is not a complex value, this always returns false. + // + // This casts value to the type of literal, then compares using ==. The usual + // admonishments about floating-point equality checks apply. We expect you to + // use this to check for complex values that can be expressed precisely as + // float pairs e.g. (-0.5, 1.0). + // + // This literal must have a dense layout. + bool IsAllComplex(complex64 value) const; + + // Literal consists entirely of the first element of the literal. + bool IsAllFirst() const; + + // Returns whether this literal is zero at the specified index. This literal + // must be an array with a dense layout. + bool IsZero(tensorflow::gtl::ArraySlice indices) const; + + // Returns the count of the elements in the array at the given shape index in + // this literal. + int64 element_count(const ShapeIndex& index = {}) const { + return ShapeUtil::ElementsIn(ShapeUtil::GetSubshape(shape(), index)); + } + + // Returns the count of the elements in the sparse array at the given shape + // index in this literal, which will be no larger than + // LayoutUtil::MaxSparseElements(SetSubshape(shape(), index).layout()). + int64 sparse_element_count() const; + + // Compute a hash for this literal. This literal must not be a sparse tensor + // or a tuple containing a sparse tensor. + size_t Hash() const; + + // Converts this literal to the given shape. Returns an error is the + // conversion is not possible. + // + // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding + // instead of truncation; otherwise, truncation is used. + // + // TODO(b/69266521): remove the round_to_bfloat16 flag when rounding becomes + // the default behavior. + StatusOr> ConvertToShape( + const Shape& dest_shape, bool round_f32_to_bf16 = false) const; + + // Converts this literal to another primitive type using a bitcast + // conversion. The to and from primitive types must have the same bit + // width. Returns an error if the conversion is not possible. This literal + // must be array-shaped. + StatusOr> BitcastConvert( + PrimitiveType primitive_dest_type) const; + + // Converts this literal to another primitive type. Returns an error if the + // conversion is not possible. This literal must be array-shaped. + StatusOr> Convert( + PrimitiveType primitive_dest_type) const; + + // Returns a literal scalar representing the first element. + Literal GetFirstScalarLiteral() const; + + // Clones the underlying buffers into a new Literal, or new + // std::unique_ptr. + Literal Clone() const; + std::unique_ptr CloneToUnique() const; + + // TODO(b/67651157): The methods below which perform computation on Literals + // (Reshape, Slice, etc) should be moved elsewhere, and perhaps combined with + // evaluator code which operates on Literals. + // + // Creates a new value that has the equivalent value as this + // literal, but conforms to new_layout; e.g. a literal matrix that was in {0, + // 1} minor-to-major dimension layout can be re-layed-out as {1, 0} + // minor-to-major dimension layout and the value in the cell at any given + // logical index (i0, i1) will be the same. + // + // For tuple shaped literals, shape_index should be used to select the inner + // array that the new layout applies to. 
+ // + // Note: this is useful when the client wants to ensure that a value placed in + // the XLA allocation tracker has a particular layout; for efficiency + // purposes or avoiding unimplemented operation/layout combinations. + std::unique_ptr Relayout(const Layout& new_layout, + const ShapeIndex& shape_index = {}) const; + + // An overload of Relayout which changes the layout of the entire shape rather + // than being limited to a single array within the shape. + std::unique_ptr Relayout(const Shape& shape_with_layout) const; + + // Creates a new literal by reshaping this literal to have the given + // dimensions. The total number of elements must not change; The + // implementation currently only supports monotonic dim0-major layouts. + // This literal must be an array. + StatusOr> Reshape( + tensorflow::gtl::ArraySlice dimensions) const; + + // Creates a new literal by broadcasting this literal with `dimensions` to + // yield a literal of shape `result_shape`. + StatusOr> Broadcast( + const Shape& result_shape, + tensorflow::gtl::ArraySlice dimensions) const; + + // Creates a new literal by reordering the dimensions of this literal. + // The given `permutation` must be a permutation of the dimension numbers + // in the original literal, and it specifies the order of the new dimensions + // in the result literal (i.e., new_order[i] = old_order[permutation[i]]). + // For example, a transpose call on a literal of shape [3 x 8 x 4] and + // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8]. + // This literal must be an array. + std::unique_ptr Transpose( + tensorflow::gtl::ArraySlice permutation) const; + + // Creates a sub-array from this literal by extracting the indices + // [start_index, limit_index) of each dimension. The result literal has the + // same rank and layout as for the given literal. The number of indices in + // start_indices and limit_indices must be the rank of the literal, and the + // indices follow the order of the dimensions. + // This literal must be an array. + std::unique_ptr Slice( + tensorflow::gtl::ArraySlice start_indices, + tensorflow::gtl::ArraySlice limit_indices) const; + + // Creates a literal with a prepended dimension with bound "times"; e.g. a + // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this + // literal replicated four times. + // This literal must be an array. + template + std::unique_ptr Replicate(int64 times) const; + + // Creates a new Literal object with the shape specified as parameter. + // The content of the literal values is the default value of the primitive + // type of literal itself (0 for numeric types, and false for predicates). + // + // Note: It's an antipattern to use this method then immediately call + // Literal::Populate on the result (since that results in zero initialization, + // then reinitialization. Conside if a call to MakeUnique(shape), + // followed by the call to Literal::Populate can be used instead. + static std::unique_ptr CreateFromShape(const Shape& shape); + + protected: + // A data structure representing a subshape at a particular ShapeIndex within + // the literal. For array-shaped ShapeIndexes, this data structure holds the + // pointer to the memory allocated for the array data. + class Piece { + public: + // Returns the buffer holding the array data for this piece as an array + // slice. This piece must be array-shaped. 
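A brief sketch of the shape-manipulating helpers documented above (Reshape, Transpose, Slice). This is illustrative only and not part of the patch; the function name and values are assumptions.

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Produces new literals from an existing 2x3 F32 literal.
void ShapeManipulationExample() {
  std::unique_ptr<xla::Literal> m =
      xla::Literal::CreateR2<float>({{1.f, 2.f, 3.f}, {4.f, 5.f, 6.f}});

  // Reshape to [3, 2]; the element count must stay the same.
  std::unique_ptr<xla::Literal> reshaped =
      m->Reshape({3, 2}).ConsumeValueOrDie();

  // Reorder dimensions with permutation {1, 0}: the result shape is [3, 2].
  std::unique_ptr<xla::Literal> transposed = m->Transpose({1, 0});

  // Take rows [0, 1) and columns [1, 3): a 1x2 literal {{2, 3}}.
  std::unique_ptr<xla::Literal> sliced = m->Slice({0, 1}, {1, 3});
}
```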
+ template + tensorflow::gtl::ArraySlice data() const; + template + tensorflow::gtl::MutableArraySlice data(); + + // Returns the buffer holding the array data for this piece as a void*. This + // piece must be array-shaped. + void* untyped_data(); + const void* untyped_data() const; + + // Gets or sets an element in the array at the given index. The multi_index + // is CHECKed against the dimension sizes of the array. This piece must be + // array-shaped. + template + NativeT Get(tensorflow::gtl::ArraySlice index) const; + template + void Set(tensorflow::gtl::ArraySlice index, NativeT value); + + // Gets/sets the buffer holding the array data. + char* buffer() const { return buffer_; } + void set_buffer(char* buffer) { buffer_ = buffer; } + + // The array of multi-indices that provide the locations of non-zero + // elements in a sparse array. Only used if + // LayoutUtil::IsSparseArray(shape()) is true. + SparseIndexArray* sparse_indices() const { return sparse_indices_; } + void set_sparse_indices(SparseIndexArray* sparse_indices) { + sparse_indices_ = sparse_indices; + } + + // Gets or sets the subshape of this piece. This reference points to a + // subshape within the shape in the containing Literal (Literal::shape_). + const Shape& subshape() const { return *subshape_; } + void set_subshape(const Shape* subshape) { subshape_ = subshape; } + + // Returns the size in bytes of the buffer holding the array data. + int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); } + + // Returns the number of elements in this piece's array. + int64 element_count() const { + // If this is a sparse array, use the number of elements represented by + // the indices in the associated SparseIndexArray. + return LayoutUtil::IsSparseArray(subshape()) + ? sparse_indices()->index_count() + : ShapeUtil::ElementsIn(subshape()); + } + + // Returns the child piece at 'index' of this piece. + Piece& child(int64 index) { return children_[index]; } + + // Adds a child piece to this piece's children. + void emplace_back(Piece child_piece) { + children_.emplace_back(std::move(child_piece)); + } + + // Returns the size of children pieces of this piece. + int64 children_size() { return children_.size(); } + + // Visitor functions that recursively traverses the piece and calls the + // given function at each child piece. The function has the type: + // void (const ShapeIndex& index, const Piece& piece) + template + void ForEachSubpiece(const Fn& func) const { + ShapeIndex index; + return ForEachHelper( + [&func](const ShapeIndex& index, const Piece& piece) { + func(index, piece); + return Status::OK(); + }, + *this, &index) + .IgnoreError(); + } + // Same as above, but the function has the type: + // Status (const ShapeIndex& index, const Piece& piece) + // The first non-OK return value is returned by the function. + template + Status ForEachSubpieceWithStatus(const Fn& func) const { + ShapeIndex index; + return ForEachHelper(func, *this, &index); + } + // Same as above, but the function has the type: + // Bool (const ShapeIndex& index, const Piece& piece) + // The first non-true return value is returned by the function. 
+ template + bool ForEachSubpieceWithBool(const Fn& func) const { + ShapeIndex index; + return ForEachHelperBool(func, *this, &index); + } + // Same as above, but the function has the type: + // Void (const ShapeIndex& index, Piece& piece) + template + void ForEachMutableSubpiece(const Fn& func) { + ShapeIndex index; + return ForEachMutableHelper( + [&func](const ShapeIndex& index, Piece* piece) { + func(index, piece); + return Status::OK(); + }, + const_cast(this), &index) + .IgnoreError(); + } + // Same as above, but the function has the type: + // Status (const ShapeIndex& index, Piece& piece) + // The first non-OK return value is returned by the function. + template + Status ForEachMutableSubpieceWithStatus(const Fn& func) { + ShapeIndex index; + return ForEachMutableHelper( + func, const_cast(this), &index); + } + + // Returns true if this piece and 'other' contain the same data. This piece + // and 'other' must be array-shaped and compatible. + bool EqualElements(const Piece& other) const; + + // Writes the shape and data (if array-shaped) into the given proto. + void WriteToProto(LiteralProto* proto) const; + + // Copy the data from 'src' into this piece's buffer. Shapes of this piece + // and src must be compatible. + Status CopyFrom(const Piece& src); + + // Copies the data from the given proto into this piece. The shape of this + // piece must be equal (not just compatible) to the shape of the proto. + Status CopyFromProto(const LiteralProto& proto); + + // Sorts the elements in a sparse array. + void SortSparseElements(); + + private: + // Helpers for traversing the piece via ForEachSubpiece rooted at 'index'. + // The first non-OK (or non-true) value is returned by the function. + // The callable 'func' has the same signature as described above in + // ForEachSubpiece*. + template + Status ForEachHelper(const Fn& func, const Piece& piece, + ShapeIndex* index) const { + TF_RETURN_IF_ERROR(func(*index, piece)); + for (int64 i = 0; i < piece.children_.size(); ++i) { + index->push_back(i); + TF_RETURN_IF_ERROR(ForEachHelper(func, piece.children_[i], index)); + index->pop_back(); + } + return Status::OK(); + } + template + bool ForEachHelperBool(const Fn& func, const Piece& piece, + ShapeIndex* index) const { + if (!func(*index, piece)) { + return false; + } + for (int64 i = 0; i < piece.children_.size(); ++i) { + index->push_back(i); + if (!ForEachHelperBool(func, piece.children_[i], index)) { + return false; + } + index->pop_back(); + } + return true; + } + template + Status ForEachMutableHelper(const Fn& func, Piece* piece, + ShapeIndex* index) { + TF_RETURN_IF_ERROR(func(*index, piece)); + for (int64 i = 0; i < piece->children_.size(); ++i) { + index->push_back(i); + TF_RETURN_IF_ERROR( + ForEachMutableHelper(func, &piece->children_[i], index)); + index->pop_back(); + } + return Status::OK(); + } + + // Recursive helper for EqualElements. + template + bool EqualElementsInternal(const Piece& other, + std::vector* multi_index) const; + + // Helper for SortSparseElements that has the element type as a template + // parameter. + template + void SortSparseElementsInternal(); + + // For array-shaped pieces, this is the buffer holding the literal data. + char* buffer_ = nullptr; + + // For sparse arrays, this is the array of indices. + SparseIndexArray* sparse_indices_ = nullptr; + + // The shape of piece. This points into the shape of the containing Literal + // (Literal::shape_). + const Shape* subshape_ = nullptr; + + // Children pieces for tuple shaped pieces. 
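Because `Piece` and `root_piece()` are protected, the visitor methods above are reachable only from `LiteralBase`, its subclasses, or the declared friends. A hypothetical member function (not part of the patch) that sums the bytes of every array-shaped subpiece might look like the following sketch.

```c++
// Hypothetical addition to LiteralBase, shown only to illustrate the
// ForEachSubpiece visitor; the traversal visits the piece tree in pre-order.
int64 TotalArrayBytes() const {
  int64 total = 0;
  root_piece().ForEachSubpiece(
      [&total](const ShapeIndex& /*index*/, const Piece& piece) {
        if (ShapeUtil::IsArray(piece.subshape())) {
          total += piece.size_bytes();  // bytes of this array's buffer.
        }
      });
  return total;
}
```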
+ std::vector children_ = {}; + }; // class Piece + + const Piece& piece(const ShapeIndex& shape_index) const { + Piece* piece = &const_cast(root_piece()); + for (const auto i : shape_index) { + DCHECK_GE(i, 0); + DCHECK_LT(i, piece->children_size()); + piece = &piece->child(i); + } + return *piece; + } + + // Returns the piece at the root of the shape. + virtual const Piece& root_piece() const = 0; + + // LiteralSlice and Literal must access Pieces of other Literals. + friend class Literal; + friend class LiteralSlice; + friend class BorrowingLiteral; + + private: + template + std::unique_ptr SliceInternal( + const Shape& result_shape, + tensorflow::gtl::ArraySlice start_indices) const; +}; + // Class representing literal values in XLA. // -// TODO(b/67651157): The methods in this class should be reduced to a minimal -// set of methods which construct Literals and accessors methods. Other methods -// which perform computation on Literals (Reshape, Slice, etc) should be moved -// elsewhere, and perhaps combined with evaluator code which operates on -// Literals. -class Literal { +// The underlying buffer and shape is always owned by this class. +class Literal : public LiteralBase { public: Literal() : Literal(ShapeUtil::MakeNil()) {} @@ -74,48 +568,162 @@ class Literal { Literal(const Literal& other) = delete; Literal& operator=(const Literal& other) = delete; Literal(Literal&& other); + // 'allocate_arrays' indicates whether to allocate memory for the arrays in + // the shape. If false, buffer pointers inside of the Literal::Pieces are set + // to nullptr. + Literal(const Shape& shape, bool allocate_arrays); Literal& operator=(Literal&& other); - // Literals are equal if they have compatible shapes and the same data - // values. Layout is not compared. - bool operator==(const Literal& other) const; - bool operator!=(const Literal& other) const { return !(*this == other); } + // TODO(b/67651157): Remove this accessor. Literal users should not be able to + // mutate the shape as this can produce malformed Literals. + Shape* mutable_shape_do_not_use() { return shape_.get(); } - // Serialize to and from a proto. - static StatusOr> CreateFromProto( - const LiteralProto& proto); - LiteralProto ToProto() const; + // Returns a MutableArraySlice view of the array for this literal for the + // given NativeT (e.g., float). CHECKs if the subshape of the literal at the + // given ShapeIndex is not array. See primitive_util.h for the mapping from + // XLA type to native type. + template + tensorflow::gtl::MutableArraySlice data( + const ShapeIndex& shape_index = {}); + // Unhide const method from parent class. + using LiteralBase::data; + + // Returns a pointer to the sparse index array. Returns nullptr if the literal + // is not a sparse array. + SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {}); + + // Returns a pointer to the underlying buffer holding the array at the given + // shape index. CHECKs if the subshape of the literal at the given ShapeIndex + // is not array. + void* untyped_data(const ShapeIndex& shape_index = {}); + // Unhide const method from parent class. + using LiteralBase::untyped_data; + + // Populates a literal with a sparse layout with the given indices and values. + // Each index in the indices array is CHECKed against the dimensions in the + // literal's shape. If sort is true, then the indices and values will be + // sorted. If sort is false, then the indices and values are assumed to + // already be in sorted order. 
See CreateSparse for an example of how data + // are populated. + template + void PopulateSparse(SparseIndexArray indices, + tensorflow::gtl::ArraySlice values, + bool sort = true); - // Return the shape of the literal. - const Shape& shape() const { return shape_; } + // Copy values from 'src_literal' rooted at 'src_shape_index' into this + // literal rooted at 'dest_shape_index'. The subshape of this literal rooted + // at 'dest_shape_index' must be compatible with the subshape of 'src_literal' + // rooted at 'src_shape_index', but need not be arrays. + Status CopyFrom(const LiteralSlice& src_literal, + const ShapeIndex& dest_shape_index = {}, + const ShapeIndex& src_shape_index = {}); + + // Similar to CopyFrom, but with move semantincs. The subshape of this literal + // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal' + // (layouts and shapes must match), but need not be arrays. The memory + // allocated in this literal for the subshape at dest_shape_index is + // deallocated, and the respective buffers are replaced with those in + // src_literal. Upon return, src_literal is set to a nil shape (empty tuple). + Status MoveFrom(Literal&& src_literal, + const ShapeIndex& dest_shape_index = {}); + + // Copies the values from src_literal, starting at src_base shape indexes, + // to this literal, starting at dest_base, where the copy size in each + // dimension is specified by copy_size. + // The src_literal and this literal must have the same primitive type, + // src_base+copy_size must fit the source literal dimensions, as well as + // dest_base+copy_size must fit the destination literal dimensions. + // Note: if either src_literal or this literal contains dimensions with zero + // element, then copy_size must be 0 in these dimensions while the + // corresponding base indices being 0. + // This literal and 'src_literal' must be arrays. + Status CopySliceFrom(const LiteralSlice& src_literal, + tensorflow::gtl::ArraySlice src_base, + tensorflow::gtl::ArraySlice dest_base, + tensorflow::gtl::ArraySlice copy_size); + + // Copies one element from src_literal[src_index] to (*this)[dest_index]. + Status CopyElementFrom(const LiteralSlice& src_literal, + tensorflow::gtl::ArraySlice src_index, + tensorflow::gtl::ArraySlice dest_index); + + // Sets an element in the literal at the given index. The multi_index is + // CHECKed against the dimension sizes. + template + void Set(tensorflow::gtl::ArraySlice multi_index, + const ShapeIndex& shape_index, NativeT value); + // Overloads of Set for array literals. CHECKs if the literal is not + // array-shaped and dense. + template + void Set(tensorflow::gtl::ArraySlice multi_index, NativeT value); + + // Appends the given element to the literal. If the elements are not appended + // in sorted order, then SortSparseElements should be called before calling + // other methods. This literal must have a sparse layout. + template + void AppendSparseElement(tensorflow::gtl::ArraySlice multi_index, + NativeT value, const ShapeIndex& shape_index = {}); + + // Sorts the elements in a sparse array. + void SortSparseElements(const ShapeIndex& shape_index = {}); + + // As Set(), but truncates `value` to the literal element type before storing. + // This literal must be an array. + Status SetIntegralAsS64(tensorflow::gtl::ArraySlice multi_index, + int64 value); + + // Populate this literal with the given values. Examples: + // + // // Populate with floats. + // Array2D float_values = ... 
+ // literal.PopulateR2FromArray2D(values); + // + // // Populate with int32s. + // literal.PopulateR2({{1, 2}, {3, 4}}); + // + // The shape and element type of this literal must match given values. For + // example, in the call above to literal.PopulateR2(), 'literal' must be a 2x2 + // array of S32. + template + void PopulateR1(tensorflow::gtl::ArraySlice values); + void PopulateR1(const tensorflow::core::Bitmap& values); + template + void PopulateR2(std::initializer_list> values); + template + void PopulateFromArray(const Array& values); + template + void PopulateR2FromArray2D(const Array2D& values); + template + void PopulateR3FromArray3D(const Array3D& values); + template + void PopulateR4FromArray4D(const Array4D& values); + + // Populates literal values by calling the generator function for every cell + // in this literal object. + // + // generator must be a callable of the type + // NativeT(tensorflow::gtl::ArraySlice indexes) or compatible. + // + // This literal must have a dense layout. + template + Status Populate(const FnType& generator); - // TODO(b/67651157): Remove this accessor. Literal users should not be able to - // mutate the shape as this can produce malformed Literals. - Shape* mutable_shape_do_not_use() { return &shape_; } + // A parallel version of Populate(). This can be used if the generator is + // thread-safe and the values for the shape's different elements are + // independent. + template + Status PopulateParallel(const FnType& generator); - // Returns a (Mutable)ArraySlice view of the array for this literal for the - // given NativeT (e.g., float). CHECKs if the subshape of the literal at the - // given ShapeIndex is not array. See primitive_util.h for the mapping from - // XLA type to native type. - template - tensorflow::gtl::ArraySlice data( - const ShapeIndex& shape_index = {}) const; + // Fills this literal with the given value. template - tensorflow::gtl::MutableArraySlice data( - const ShapeIndex& shape_index = {}); + void PopulateWithValue(NativeT value); - // Returns a pointer to the sparse index array. Returns nullptr if the literal - // is not a sparse array. - const SparseIndexArray* sparse_indices( - const ShapeIndex& shape_index = {}) const; - SparseIndexArray* sparse_indices(const ShapeIndex& shape_index = {}); + // Factory methods below. + // - // Returns a pointer to (or size of) the underlying buffer holding the array - // at the given shape index. CHECKs if the subshape of the literal at the - // given ShapeIndex is not array. - const void* untyped_data(const ShapeIndex& shape_index = {}) const; - void* untyped_data(const ShapeIndex& shape_index = {}); - int64 size_bytes(const ShapeIndex& shape_index = {}) const; + // Serialize from a proto. + static StatusOr> CreateFromProto( + const LiteralProto& proto); // Creates a new literal of a given rank. To minimize ambiguity (for users // and the compiler) these CreateR[0-2] methods should explicitly specify the @@ -163,10 +771,6 @@ class Literal { values, const Layout& layout); - // Returns this literal's data as a string. This literal must be a rank-1 U8 - // array. - string GetR1U8AsString() const; - // Creates a literal with a sparse layout and the given indices and values. // The shape is initialized from the given dimensions. The minor dimension of // the indices array must equal the rank of the shape (i.e. 
size of the @@ -206,171 +810,16 @@ class Literal { tensorflow::gtl::ArraySlice dimensions, SparseIndexArray indices, tensorflow::gtl::ArraySlice values, bool sort = true); - // Populates a literal with a sparse layout with the given indices and values. - // Each index in the indices array is CHECKed against the dimensions in the - // literal's shape. If sort is true, then the indices and values will be - // sorted. If sort is false, then the indices and values are assumed to - // already be in sorted order. See CreateSparse for an example of how data - // are populated. - template - void PopulateSparse(SparseIndexArray indices, - tensorflow::gtl::ArraySlice values, - bool sort = true); - - // Creates a new Literal object with the shape specified as parameter. - // The content of the literal values is the default value of the primitive - // type of literal itself (0 for numeric types, and false for predicates). - static std::unique_ptr CreateFromShape(const Shape& shape); - - // Creates a new Literal object with its values havings the primitive_type - // type, and with dimensions defined by the dimensions parameter. - // The content of the literal values is the default value of the primitive - // type of literal itself (0 for numeric types, and false for predicates). - static std::unique_ptr CreateFromDimensions( - PrimitiveType primitive_type, - tensorflow::gtl::ArraySlice dimensions); - - // Copy values from 'src_literal' rooted at 'src_shape_index' into this - // literal rooted at 'dest_shape_index'. The subshape of this literal rooted - // at 'dest_shape_index' must be compatible with the subshape of 'src_literal' - // rooted at 'src_shape_index', but need not be arrays. - Status CopyFrom(const Literal& src_literal, - const ShapeIndex& dest_shape_index = {}, - const ShapeIndex& src_shape_index = {}); - - // Similar to CopyFrom, but with move semantincs. The subshape of this literal - // rooted at 'dest_shape_index' must be *equal* to the shape 'src_literal' - // (layouts and shapes must match), but need not be arrays. The memory - // allocated in this literal for the subshape at dest_shape_index is - // deallocated, and the respective buffers are replaced with those in - // src_literal. Upon return, src_literal is set to a nil shape (empty tuple). - Status MoveFrom(Literal&& src_literal, - const ShapeIndex& dest_shape_index = {}); - - // Copies the values from src_literal, starting at src_base shape indexes, - // to this literal, starting at dest_base, where the copy size in each - // dimension is specified by copy_size. - // The src_literal and this literal must have the same primitive type, - // src_base+copy_size must fit the source literal dimensions, as well as - // dest_base+copy_size must fit the destination literal dimensions. - // Note: if either src_literal or this literal contains dimensions with zero - // element, then copy_size must be 0 in these dimensions while the - // corresponding base indices being 0. - // This literal and 'src_literal' must be arrays. - Status CopySliceFrom(const Literal& src_literal, - tensorflow::gtl::ArraySlice src_base, - tensorflow::gtl::ArraySlice dest_base, - tensorflow::gtl::ArraySlice copy_size); - - // Copies one element from src_literal[src_index] to (*this)[dest_index]. - Status CopyElementFrom(const Literal& src_literal, - tensorflow::gtl::ArraySlice src_index, - tensorflow::gtl::ArraySlice dest_index); - - // Returns a vector containing the tuple elements of this Literal as separate - // Literals. 
This Literal must be tuple-shaped and can be a nested tuple. The - // elements are moved into the new Literals; no data is copied. Upon return - // this Literal is set to a nil shape (empty tuple) - std::vector DecomposeTuple(); - - // This operation is the inverse of DecomposeTuple. The given elements are - // moved into the tuple elements of a new tuple-shaped Literal which is - // returned. Upon return, each of the Literals in 'elements' is set to a nil - // shape (empty tuple). - static Literal MoveIntoTuple( - tensorflow::gtl::MutableArraySlice elements); - - // Creates a new value that has the equivalent value as this literal, but - // conforms to new_layout; e.g. a literal matrix that was in {0, 1} - // minor-to-major dimension layout can be re-layed-out as {1, 0} - // minor-to-major dimension layout and the value in the cell at any given - // logical index (i0, i1) will be the same. - // - // For tuple shaped literals, shape_index should be used to select the inner - // array that the new layout applies to. - // - // Note: this is useful when the client wants to ensure that a value placed in - // the XLA allocation tracker has a particular layout; for efficiency - // purposes or avoiding unimplemented operation/layout combinations. - std::unique_ptr Relayout(const Layout& new_layout, - const ShapeIndex& shape_index = {}) const; - - // An overload of Relayout which changes the layout of the entire shape rather - // than being limited to a single array within the shape. - std::unique_ptr Relayout(const Shape& shape_with_layout) const; - - // Creates a new literal by reshaping this literal to have the given - // dimensions. The total number of elements must not change; The - // implementation currently only supports monotonic dim0-major layouts. - // This literal must be an array. - StatusOr> Reshape( - tensorflow::gtl::ArraySlice dimensions) const; - - // Creates a new literal by reordering the dimensions of this literal. - // The given `permutation` must be a permutation of the dimension numbers - // in the original literal, and it specifies the order of the new dimensions - // in the result literal (i.e., new_order[i] = old_order[permutation[i]]). - // For example, a transpose call on a literal of shape [3 x 8 x 4] and - // `permutation` = {2, 0, 1} returns a new literal of shape [4 x 3 x 8]. - // This literal must be an array. - std::unique_ptr Transpose( - tensorflow::gtl::ArraySlice permutation) const; - - // Creates a sub-array from this literal by extracting the indices - // [start_index, limit_index) of each dimension. The result literal has the - // same rank and layout as for the given literal. The number of indices in - // start_indices and limit_indices must be the rank of the literal, and the - // indices follow the order of the dimensions. - // This literal must be an array. - std::unique_ptr Slice( - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices) const; - - // Creates a literal with a prepended dimension with bound "times"; e.g. a - // f32[3x2] with times=4 will produce a f32[4x3x2] with the 3x2 from this - // literal replicated four times. - // This literal must be an array. - template - std::unique_ptr Replicate(int64 times) const; - - // Converts this literal to another primitive type using - // static_cast<>. Returns an error if the conversion is not possible. This - // literal must be array-shaped. 
- StatusOr> Convert( - PrimitiveType primitive_dest_type) const; - - // Converts this literal to another primitive type using a bitcast - // conversion. The to and from primitive types must have the same bit - // width. Returns an error if the conversion is not possible. This literal - // must be array-shaped. - StatusOr> BitcastConvert( - PrimitiveType primitive_dest_type) const; - - // Converts this literal to the given shape. Returns an error is the - // conversion is not possible. - // - // round_f32_to_bf16: if true, converting F32 elements to BF16 uses rounding - // instead of truncation; otherwise, truncation is used. - // - // TODO(b/69266521): remove the round_to_bfloat16 flag when rounding becomes - // the default behavior. - StatusOr> ConvertToShape( - const Shape& dest_shape, bool round_f32_to_bf16 = false) const; - // Creates a scalar literal value zero of the given primitive type. static Literal Zero(PrimitiveType primitive_type); - // Creates a scalar literal value one of the given primitive type. static Literal One(PrimitiveType primitive_type); - // Creates a scalar literal value containing the minimum value of the given // primitive type. For floating-point types, returns -inf. static Literal MinValue(PrimitiveType primitive_type); - // Creates a scalar literal value containing the maximum value of the given // primitive type. For floating-point types, returns inf. static Literal MaxValue(PrimitiveType primitive_type); - // Creates a literal of the given shape where each element is `value`. template static std::unique_ptr CreateFullWithDescendingLayout( @@ -420,83 +869,10 @@ class Literal { // Creates a literal that projects the (x, y) dimensions given in values into // the z and p dimensions given. - template - static std::unique_ptr CreateR4Projected( - std::initializer_list> values, - int64 projection_p, int64 projection_z); - - // Clones this literal into a new Literal, or new std::unique_ptr. - Literal Clone() const; - std::unique_ptr CloneToUnique() const; - - // Gets or sets an element in the literal at the given index. The multi_index - // is CHECKed against the dimension sizes. - template - NativeT Get(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index) const; - template - void Set(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index, NativeT value); - - // Overloads of Get and Set for array literals. CHECKs if the literal is not - // array-shaped and dense. - template - NativeT Get(tensorflow::gtl::ArraySlice multi_index) const; - template - void Set(tensorflow::gtl::ArraySlice multi_index, NativeT value); - - // Returns the multi-index of the element in a sparse literal at the given - // sparse element number. The sparse element number is the position with in - // the sparse array's list of (index, value) pairs, and is checked against the - // total number of (index, value) pairs in the sparse array. - tensorflow::gtl::ArraySlice GetSparseIndex( - int64 sparse_element_number, const ShapeIndex& shape_index = {}) const; - - // Returns the value of the element in a sparse literal at the given sparse - // element number. The sparse element number is the position with in the - // sparse array's list of (index, value) pairs, and is checked against the - // total number of (index, value) pairs in the sparse array. - template - NativeT GetSparseElement(int64 sparse_element_number, - const ShapeIndex& shape_index = {}) const; - - // Appends the given element to the literal. 
If the elements are not appended - // in sorted order, then SortSparseElements should be called before calling - // other methods. This literal must have a sparse layout. - template - void AppendSparseElement(tensorflow::gtl::ArraySlice multi_index, - NativeT value, const ShapeIndex& shape_index = {}); - - // Sorts the elements in a sparse array. - void SortSparseElements(const ShapeIndex& shape_index = {}); - - // Returns the element value at index (0, ..., 0), however many zeroes are - // required for that index. - template - NativeT GetFirstElement() const; - - // Returns a literal scalar representing the first element. - Literal GetFirstScalarLiteral() const; - - // As Get(), but determines the correct type and converts the value - // into text. - string GetAsString(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index = {}) const; - - // As GetSparseElement(), but determines the correct type and converts the - // value into text. - string GetSparseElementAsString(int64 sparse_element_number, - const ShapeIndex& shape_index = {}) const; - - // As Get(), but determines the correct type and converts the value into - // int64. This literal must be an array. - StatusOr GetIntegralAsS64( - tensorflow::gtl::ArraySlice multi_index) const; - - // As Set(), but truncates `value` to the literal element type before storing. - // This literal must be an array. - Status SetIntegralAsS64(tensorflow::gtl::ArraySlice multi_index, - int64 value); + template + static std::unique_ptr CreateR4Projected( + std::initializer_list> values, + int64 projection_p, int64 projection_z); // Returns an identity matrix (rank 2) with the given row and column count. template @@ -507,6 +883,9 @@ class Literal { static std::unique_ptr MakeTuple( tensorflow::gtl::ArraySlice elements); + static std::unique_ptr MakeTupleFromSlices( + tensorflow::gtl::ArraySlice elements); + // As above, but intended to be invoked with move semantics; i.e. // // std::vector> elements = ...; @@ -538,136 +917,104 @@ class Literal { return MakeTupleOwned(std::move(v)); } - // Returns a string representation of the literal value. - // Warning: this function can take minutes for multi-million element Literals. - string ToString(bool print_layout = false) const; - - // Invokes the "per cell" callback for each element in the provided - // literal with the element's indices and a string representation of - // the element's value. - // - // This function is useful if you want a polymorphic representation - // of the tensor's elements (turning it to a string for something - // like representation in a protobuf). - // - // This literal must have a dense layout. - void EachCellAsString( - const std::function indices, - const string& value)>& per_cell) const; - template - void EachCell(std::function indices, - NativeT value)> - per_cell) const; - - // Populate this literal with the given values. Examples: - // - // // Populate with floats. - // Array2D float_values = ... - // literal.PopulateR2FromArray2D(values); - // - // // Populate with int32s. - // literal.PopulateR2({{1, 2}, {3, 4}}); - // - // The shape and element type of this literal must match given values. For - // example, in the call above to literal.PopulateR2(), 'literal' must be a 2x2 - // array of S32. 
- template - void PopulateR1(tensorflow::gtl::ArraySlice values); - void PopulateR1(const tensorflow::core::Bitmap& values); - template - void PopulateR2(std::initializer_list> values); - template - void PopulateFromArray(const Array& values); - template - void PopulateR2FromArray2D(const Array2D& values); - template - void PopulateR3FromArray3D(const Array3D& values); - template - void PopulateR4FromArray4D(const Array4D& values); - - // Populates literal values by calling the generator function for every cell - // in this literal object. - // - // generator must be a callable of the type - // NativeT(tensorflow::gtl::ArraySlice indexes) or compatible. - // - // This literal must have a dense layout. - template - Status Populate(const FnType& generator); - - // A parallel version of Populate(). This can be used if the generator is - // thread-safe and the values for the shape's different elements are - // independent. - template - Status PopulateParallel(const FnType& generator); + // Returns a vector containing the tuple elements of this Literal as separate + // Literals. This Literal must be tuple-shaped and can be a nested tuple. The + // elements are moved into the new Literals; no data is copied. Upon return + // this Literal is set to a nil shape (empty tuple) + std::vector DecomposeTuple(); - // Fills this literal with the given value. - template - void PopulateWithValue(NativeT value); + // This operation is the inverse of DecomposeTuple. The given elements are + // moved into the tuple elements of a new tuple-shaped Literal which is + // returned. Upon return, each of the Literals in 'elements' is set to a nil + // shape (empty tuple). + static Literal MoveIntoTuple( + tensorflow::gtl::MutableArraySlice elements); - // Returns whether every element in this literal is equal to value. - // - // value is an int8 because we expect this to be called with small - // compile-time constants (0, -1, etc.) and so that whatever value you pass - // can be represented exactly by floating-point types as small as 16 bits. - // - // If value doesn't fit in this literal's type, returns false. Values of 1/0 - // are considered equal to true/false; other values are not considered equal - // to true. Also if this literal is not array-shaped false is returned. - bool IsAll(int8 value) const; + // Creates a new Literal object with its values havings the primitive_type + // type, and with dimensions defined by the dimensions parameter. + // The content of the literal values is the default value of the primitive + // type of literal itself (0 for numeric types, and false for predicates). + static std::unique_ptr CreateFromDimensions( + PrimitiveType primitive_type, + tensorflow::gtl::ArraySlice dimensions); - // Like IsAll(const Literal&, int8), except we check whether the literal is - // equal to a particular floating-point number. - // - // If the literal is not a floating-point value, this always returns false. - // - // This casts value to the type of literal, then compares using ==. The usual - // admonishments about floating-point equality checks apply. We expect you to - // use this to check for values that can be expressed precisely as a float, - // e.g. -0.5. Also if this literal is not array-shaped false is returned. - bool IsAllFloat(float value) const; + // If the given literal's data type is bfloat16, converts it to a float + // literal; otherwise, returns a copy of it. If the literal is a tuple, + // recursively converts its elements. 
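A short sketch of building a tuple literal and taking it apart again, per the MakeTuple/MakeTupleOwned factories and DecomposeTuple described above. The function name and values are illustrative, not part of the patch.

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Builds a (f32[], f32[2,2]) tuple literal, then moves its elements back out.
void TupleExample() {
  auto scalar = xla::Literal::CreateR0<float>(1.0f);
  auto matrix = xla::Literal::CreateR2<float>({{1.0f, 2.0f}, {3.0f, 4.0f}});

  // MakeTuple copies the element literals; the inputs stay valid.
  std::unique_ptr<xla::Literal> tuple =
      xla::Literal::MakeTuple({scalar.get(), matrix.get()});

  // DecomposeTuple moves the elements out; 'tuple' is left as a nil shape.
  std::vector<xla::Literal> elements = tuple->DecomposeTuple();
}
```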
+ static std::unique_ptr ConvertBF16ToF32( + const LiteralSlice& bf16_literal); + + // If the given literal's data type is float, converts it to a bfloat16 + // literal; otherwise, returns a copy of it. If the literal is a tuple, + // recursively converts its elements. + static std::unique_ptr ConvertF32ToBF16( + const LiteralSlice& f32_literal); + + // Creates a literal with a new shape with the given new dimensions using the + // data in the given input literal. For reshaping purposes the (flat) data + // buffer of the input literal is assumed to have the given minor_to_major + // layout order. + static std::unique_ptr ReshapeSlice( + tensorflow::gtl::ArraySlice new_dimensions, + tensorflow::gtl::ArraySlice minor_to_major, + const LiteralSlice& literal); + + // Creates a literal with the supplied shape, and uses the provided value + // generator to populate the literal's values. + // Returns the new literal object, or an error Status if failed. + template < + PrimitiveType type, + typename T = typename primitive_util::PrimitiveTypeToNative::type> + static StatusOr> CreateRandomLiteral( + const Shape& shape, + const std::function)>& generator); + + // Creates a literal with the supplied shape, and initializes the literal + // values using a normal distribution with given mean and stddev standard + // deviation, and using the engine as entropy generator. + // Returns the new literal object, or an error Status if failed. + template < + PrimitiveType type, typename E, + typename T = typename primitive_util::PrimitiveTypeToNative::type> + static StatusOr> CreateRandomLiteral( + const Shape& shape, E* engine, T mean, T stddev); + + // Creates a literal with the supplied shape, and initializes the literal + // values using a normal distribution with given mean and stddev standard + // deviation. + // Returns the new literal object, or an error Status if failed. + template < + PrimitiveType type, + typename T = typename primitive_util::PrimitiveTypeToNative::type> + static StatusOr> CreateRandomLiteral( + const Shape& shape, T mean, T stddev); - // Like IsAll(const Literal&, int8), except we check whether the literal is - // equal to a particular complex number. - // - // If the literal is not a complex value, this always returns false. // - // This casts value to the type of literal, then compares using ==. The usual - // admonishments about floating-point equality checks apply. We expect you to - // use this to check for complex values that can be expressed precisely as - // float pairs e.g. (-0.5, 1.0). - // - // This literal must have a dense layout. - bool IsAllComplex(complex64 value) const; + // End of factory methods. - // Literal consists entirely of the first element of the literal. - bool IsAllFirst() const; + // Returns a multi-dimensional index as a string. For example: '{7, 8}' will + // be returned for a 2-dimensional index with dimension 0 index equal to 7, + // dimension 1 equal to 8. + static string MultiIndexAsString( + tensorflow::gtl::ArraySlice multi_index); - // Returns whether this literal is zero at the specified index. This literal - // must be an array with a dense layout. - bool IsZero(tensorflow::gtl::ArraySlice indices) const; + private: + // Recursively sets the subshapes and buffers of all subpieces rooted at + // 'piece'. If 'allocate_array' is true, memory is allocated for the arrays in + // the shape. 
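The BF16/F32 conversion helpers declared above can be exercised as follows; this is a sketch with an illustrative function name, and it relies on the implicit Literal-to-LiteralSlice conversion introduced by this change.

```c++
#include "tensorflow/compiler/xla/literal_util.h"

// Round-trips an F32 literal through BF16; tuple literals convert recursively.
void Bf16RoundTripExample() {
  auto f32 = xla::Literal::CreateR1<float>({0.5f, 1.0f, 2.0f});
  std::unique_ptr<xla::Literal> bf16 = xla::Literal::ConvertF32ToBF16(*f32);
  std::unique_ptr<xla::Literal> back = xla::Literal::ConvertBF16ToF32(*bf16);
}
```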
+ void SetPiece(const Shape& shape, Piece* piece, bool allocate_arrays); - // Return the count of the elements in the array at the given shape index in - // this literal. - int64 element_count(const ShapeIndex& index = {}) const { - return ShapeUtil::ElementsIn(ShapeUtil::GetSubshape(shape(), index)); + // Returns the piece at the given ShapeIndex. + Piece& piece(const ShapeIndex& shape_index) { + return const_cast(LiteralBase::piece(shape_index)); } - // Return the count of the elements in the sparse array at the given shape - // index in this literal, which will be no larger than - // LayoutUtil::MaxSparseElements(SetSubshape(shape(), index).layout()). - int64 sparse_element_count() const; - - protected: - // 'allocate_arrays' indicates whether to allocate memory for the arrays in - // the shape. If false, buffer pointers inside of the Literal::Pieces are set - // to nullptr. - Literal(const Shape& shape, bool allocate_arrays); + Piece& root_piece() const override { return *root_piece_; }; // Internal template helper for the Literal::CopySliceFrom(), matching its // arguments one by one. template - Status CopySliceFromInternal(const Literal& src_literal, + Status CopySliceFromInternal(const LiteralBase& src_literal, tensorflow::gtl::ArraySlice src_base, tensorflow::gtl::ArraySlice dest_base, tensorflow::gtl::ArraySlice copy_size); @@ -695,162 +1042,69 @@ class Literal { int64 minor_loop_size = 1; }; - // A data structure representing a subshape at a particular ShapeIndex within - // the literal. For array-shaped ShapeIndexes, this data structure holds the - // pointer to the memory allocated for the array data. - class Piece { - public: - // Return the buffer holding the array data for this piece as an array - // slice. This piece must be array-shaped. - template - tensorflow::gtl::ArraySlice data() const; - template - tensorflow::gtl::MutableArraySlice data(); - - // Return the buffer holding the array data for this piece as a void*. This - // piece must be array-shaped. - void* untyped_data(); - const void* untyped_data() const; - - // Gets or sets an element in the array at the given index. The multi_index - // is CHECKed against the dimension sizes of the array. This piece must be - // array-shaped. - template - NativeT Get(tensorflow::gtl::ArraySlice index) const; - template - void Set(tensorflow::gtl::ArraySlice index, NativeT value); - - // Gets/sets the buffer holding the array data. - char* buffer() const { return buffer_; } - void set_buffer(char* buffer) { buffer_ = buffer; } - - // The array of multi-indices that provide the locations of non-zero - // elements in a sparse array. Only used if - // LayoutUtil::IsSparseArray(shape()) is true. - SparseIndexArray* sparse_indices() const { return sparse_indices_; } - void set_sparse_indices(SparseIndexArray* sparse_indices) { - sparse_indices_ = sparse_indices; - } - - // Gets or sets the subshape of this piece. This reference points to a - // subshape within the shape in the containing Literal (Literal::shape_). - const Shape& subshape() const { return *subshape_; } - void set_subshape(const Shape* subshape) { subshape_ = subshape; } - - // Returns the size in bytes of the buffer holding the array data. - int64 size_bytes() const { return ShapeUtil::ByteSizeOf(subshape()); } - - // Returns the number of elements in this piece's array. - int64 element_count() const { - // If this is a sparse array, use the number of elements represented by - // the indices in the associated SparseIndexArray. 
- return LayoutUtil::IsSparseArray(subshape()) - ? sparse_indices()->index_count() - : ShapeUtil::ElementsIn(subshape()); - } - - // Copy the data from 'src' into this piece's buffer. Shapes of this piece - // and src must be compatible. - Status CopyFrom(const Piece& src); - - // Returns true if this piece and 'other' contain the same data. This piece - // and 'other' must be array-shaped and compatible. - bool EqualElements(const Piece& other) const; - - // Writes the shape and data (if array-shaped) into the given proto. - void WriteToProto(LiteralProto* proto) const; - - // Copies the data from the given proto into this piece. The shape of this - // piece must be equal (not just compatible) to the shape of the proto. - Status CopyFromProto(const LiteralProto& proto); - - // Sorts the elements in a sparse array. - void SortSparseElements(); - - private: - // Recursive helper for EqualElements. - template - bool EqualElementsInternal(const Piece& other, - std::vector* multi_index) const; - - // Helper for SortSparseElements that has the element type as a template - // parameter. - template - void SortSparseElementsInternal(); - - // For array-shaped pieces, this is the buffer holding the literal data. - char* buffer_ = nullptr; - - // For sparse arrays, this is the array of indices. - SparseIndexArray* sparse_indices_ = nullptr; - - // The shape of piece. This points into the shape of the containing Literal - // (Literal::shape_). - const Shape* subshape_ = nullptr; - }; - - // Returns the piece at the given ShapeIndex. - Piece& piece(const ShapeIndex& shape_index) { - return *pieces_.mutable_element(shape_index); - } - const Piece& piece(const ShapeIndex& shape_index) const { - return pieces_.element(shape_index); - } - - // Returns the piece at the root of the shape (empty ShapeIndex). - Piece& root_piece() { return piece({}); } - const Piece& root_piece() const { return piece({}); } + // Literal class always owns the shape. The parent class borrows this shape. + std::unique_ptr shape_; - // Deallocate the buffers held by this literal (if the literal owns the - // buffer). - void DeallocateBuffers(); + Piece* root_piece_ = nullptr; // Implementation details shared between Populate() and PopulateParallel() template Status PopulateInternal(const FnType& generator, bool parallel); - Shape shape_; - ShapeTree pieces_; - - // Whether the buffers held in pieces_ are owned by this Literal. - bool owns_buffers_; - - // LiteralView must access and manipulate Pieces of other Literals. - friend class LiteralView; -}; // namespace xla + // Deallocate the buffers held by this literal. + void DeallocateBuffers(); + friend class LiteralBase; +}; std::ostream& operator<<(std::ostream& out, const Literal& literal); -// A read-only view of a Literal. A LiteralView contains pointers to buffers -// owned by the viewed Literal. -// -// TODO(b/71550060): Replace LiteralView with Literal slice classes (immutable -// and mutable) similar to (Mutable)ArraySlice. -class LiteralView : public Literal { +// A read-only view of a Literal. A LiteralSlice contains pointers to shape and +// literal buffers always owned by others. +class LiteralSlice : public LiteralBase { public: - // Create and return a view of the given literal rooted at the given shape - // index within the given literal. A factory is used rather than a public - // constructor because only const LiteralViews are supported. 
It's still - // possible to create non-const LiteralViews via the copy constructors, but - // the factory method makes it a bit less likely. Implementing literal slices - // will fix this undesirable situation (b/71550060). - static const LiteralView Create(const Literal& literal, - const ShapeIndex& view_root = {}); + LiteralSlice() : LiteralBase() {} + + // Implicit conversion constructors. + LiteralSlice(const LiteralBase& literal); + LiteralSlice(const LiteralBase& literal, const ShapeIndex& view_root); - LiteralView(const LiteralView& other); - LiteralView& operator=(const LiteralView& other); + private: + const Piece& root_piece() const override { return *root_piece_; }; + + const Piece* root_piece_; // Not owned. +}; - virtual ~LiteralView(); +// A read-only Literal where the underlying buffers are never owned by this +// class. +class BorrowingLiteral : public LiteralBase { + public: + BorrowingLiteral() : LiteralBase() {} + + // 'src_buf_ptr' is not owned by this class and must outlive the + // lifetime of this class. It points to an appropirately sized buffer with + // data interpretered as indicated by 'shape'. + // This constructor is only used for array shapes. + BorrowingLiteral(const char* src_buf_ptr, const Shape& shape); + // Similar as above, except to be used for constructing non-nested tuples. + BorrowingLiteral(tensorflow::gtl::ArraySlice src_buf_ptrs, + const Shape& shape); + // TODO(b/79707221): adding constructors for nested tuples as well. private: - LiteralView(const Literal& literal, const ShapeIndex& view_root); + // Recursively builds the subtree for the given piece and sets the subshapes + // of the given piece with the given shape. + void BuildPieceSubtree(const Shape& shape, Piece* piece); - // Helper for the copy constructor and copy assignment operator. - void CopyFrom(const LiteralView& other); + // Accessor for the root piece of this literal. + const Piece& root_piece() const override { return root_piece_; }; + Piece root_piece_; + + // Shape of this literal. 
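To make the two new non-owning classes concrete, here is a usage sketch. The function name, buffer, and values are illustrative; the backing memory must match the layout implied by the shape and must outlive the BorrowingLiteral, as the comments above require.

```c++
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Wraps caller-owned memory and creates a read-only view into a tuple element.
void NonOwningViewsExample() {
  // BorrowingLiteral: interpret an existing buffer as an f32[2,2] literal.
  float backing[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 2});
  xla::BorrowingLiteral borrowed(reinterpret_cast<const char*>(backing), shape);

  // LiteralSlice: view element {0} of a tuple literal without copying data.
  auto scalar = xla::Literal::CreateR0<float>(7.0f);
  auto tuple = xla::Literal::MakeTuple({scalar.get()});
  xla::LiteralSlice element0(*tuple, /*view_root=*/{0});
}
```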
+ const Shape shape_; }; template -tensorflow::gtl::ArraySlice Literal::Piece::data() const { +tensorflow::gtl::ArraySlice LiteralBase::Piece::data() const { CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); CHECK_EQ(subshape().element_type(), primitive_util::NativeToPrimitiveType()) @@ -863,7 +1117,7 @@ tensorflow::gtl::ArraySlice Literal::Piece::data() const { } template -tensorflow::gtl::MutableArraySlice Literal::Piece::data() { +tensorflow::gtl::MutableArraySlice LiteralBase::Piece::data() { CHECK(ShapeUtil::IsArray(subshape())) << ShapeUtil::HumanString(subshape()); CHECK_EQ(subshape().element_type(), primitive_util::NativeToPrimitiveType()) @@ -876,7 +1130,7 @@ tensorflow::gtl::MutableArraySlice Literal::Piece::data() { } template -NativeT Literal::Piece::Get( +NativeT LiteralBase::Piece::Get( tensorflow::gtl::ArraySlice multi_index) const { CHECK(LayoutUtil::IsDenseArray(subshape())); return data()[IndexUtil::MultidimensionalIndexToLinearIndex( @@ -884,15 +1138,15 @@ NativeT Literal::Piece::Get( } template -void Literal::Piece::Set(tensorflow::gtl::ArraySlice multi_index, - NativeT value) { +void LiteralBase::Piece::Set(tensorflow::gtl::ArraySlice multi_index, + NativeT value) { CHECK(LayoutUtil::IsDenseArray(subshape())); data()[IndexUtil::MultidimensionalIndexToLinearIndex( subshape(), multi_index)] = value; } template -tensorflow::gtl::ArraySlice Literal::data( +tensorflow::gtl::ArraySlice LiteralBase::data( const ShapeIndex& shape_index) const { return piece(shape_index).data(); } @@ -904,13 +1158,13 @@ tensorflow::gtl::MutableArraySlice Literal::data( } template -inline NativeT Literal::Get(tensorflow::gtl::ArraySlice multi_index, - const ShapeIndex& shape_index) const { +inline NativeT LiteralBase::Get(tensorflow::gtl::ArraySlice multi_index, + const ShapeIndex& shape_index) const { return piece(shape_index).Get(multi_index); } template -inline NativeT Literal::Get( +inline NativeT LiteralBase::Get( tensorflow::gtl::ArraySlice multi_index) const { return root_piece().Get(multi_index); } @@ -1157,13 +1411,13 @@ template } template -NativeT Literal::GetFirstElement() const { +NativeT LiteralBase::GetFirstElement() const { return data().at(0); } template -NativeT Literal::GetSparseElement(int64 sparse_element_number, - const ShapeIndex& shape_index) const { +NativeT LiteralBase::GetSparseElement(int64 sparse_element_number, + const ShapeIndex& shape_index) const { CHECK( LayoutUtil::IsSparseArray(ShapeUtil::GetSubshape(shape(), shape_index))); return data(shape_index)[sparse_element_number]; @@ -1196,7 +1450,7 @@ template } template -void Literal::EachCell( +void LiteralBase::EachCell( std::function indices, NativeT value)> per_cell) const { @@ -1372,7 +1626,7 @@ template } template -std::unique_ptr Literal::Replicate(int64 times) const { +std::unique_ptr LiteralBase::Replicate(int64 times) const { DimensionVector bounds = {times}; bounds.reserve(shape().dimensions_size() + 1); for (int64 bound : shape().dimensions()) { @@ -1407,6 +1661,38 @@ std::unique_ptr Literal::Replicate(int64 times) const { return literal; } +template +/* static */ StatusOr> Literal::CreateRandomLiteral( + const Shape& shape, + const std::function)>& generator) { + using NativeT = typename primitive_util::PrimitiveTypeToNative::type; + TF_RET_CHECK(shape.element_type() == type); + auto literal = MakeUnique(shape); + TF_RETURN_IF_ERROR(literal.get()->Populate( + [&](tensorflow::gtl::ArraySlice indexes) { + return generator(indexes); + })); + return std::move(literal); +} + 
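A usage sketch of the CreateRandomLiteral factories (illustrative function name): the no-engine overload used here seeds its own std::minstd_rand0 internally and delegates to the generator-based overload defined just above.

```c++
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

// Creates an f32[128,64] literal whose values are drawn from N(0, 1).
void RandomLiteralExample() {
  xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {128, 64});
  std::unique_ptr<xla::Literal> literal =
      xla::Literal::CreateRandomLiteral<xla::F32>(shape, /*mean=*/0.0f,
                                                  /*stddev=*/1.0f)
          .ConsumeValueOrDie();
}
```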
+template +/* static */ StatusOr> Literal::CreateRandomLiteral( + const Shape& shape, E* engine, T mean, T stddev) { + using NativeT = typename primitive_util::PrimitiveTypeToNative::type; + std::normal_distribution generator(mean, stddev); + return CreateRandomLiteral( + shape, [&](tensorflow::gtl::ArraySlice /*indexes*/) { + return generator(*engine); + }); +} + +template +/* static */ StatusOr> Literal::CreateRandomLiteral( + const Shape& shape, T mean, T stddev) { + std::minstd_rand0 engine; + return CreateRandomLiteral(shape, &engine, mean, stddev); +} + } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_LITERAL_UTIL_H_ diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc index 61046784e05623..f127cee0fdc126 100644 --- a/tensorflow/compiler/xla/literal_util_test.cc +++ b/tensorflow/compiler/xla/literal_util_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/layout_util.h" @@ -974,7 +975,7 @@ TEST_F(LiteralUtilTest, CopyFromTuples) { Literal::CreateR1({2.0, 4.0}).get(), &nil_literal}); - EXPECT_EQ(*matrix, LiteralView::Create(*nested_tuple, {0})); + EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0})); EXPECT_EQ(nested_tuple->Get({}, {1, 0}), 42); EXPECT_EQ(nested_tuple->Get({0}, {1, 1}), 23.0); EXPECT_EQ(nested_tuple->Get({1}, {1, 1}), 44.0); @@ -985,7 +986,7 @@ TEST_F(LiteralUtilTest, CopyFromTuples) { /*src_shape_index=*/{})); // The matrix element should be unchanged. - EXPECT_EQ(*matrix, LiteralView::Create(*nested_tuple, {0})); + EXPECT_EQ(*matrix, LiteralSlice(*nested_tuple, {0})); // The tuple element should have been copied from 'tuple'. EXPECT_EQ(nested_tuple->Get({}, {1, 0}), -5); @@ -1065,7 +1066,7 @@ TEST_F(LiteralUtilTest, Populate) { Shape shape = ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), data.dimensions, data.layout); - auto literal = Literal::CreateFromShape(shape); + auto literal = MakeUnique(shape); auto generator = [&](ArraySlice indexes) -> uint32 { // Offsets from linear index just to avoid R0 literals to be initialized // with zero. @@ -1107,7 +1108,7 @@ TEST_F(LiteralUtilTest, PopulateParallel) { Shape shape = ShapeUtil::MakeShapeWithLayout( primitive_util::NativeToPrimitiveType(), data.dimensions, data.layout); - auto literal = Literal::CreateFromShape(shape); + auto literal = MakeUnique(shape); auto generator = [&](ArraySlice indexes) -> uint32 { // Offsets from linear index just to avoid R0 literals to be initialized // with zero. 
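Editor's note: the `CreateRandomLiteral` helpers added above come in three flavors: one taking a per-element generator callback, one taking a caller-supplied random engine plus mean/stddev, and a convenience overload that defaults to `std::minstd_rand0`. The sketch below shows the convenience overload; because the template parameter lists were lost in the extracted hunk, selecting the primitive type via the `<xla::F32>` template argument is an assumption inferred from the `shape.element_type() == type` check in the implementation above.

```c++
// Hedged sketch of the CreateRandomLiteral convenience overload.
#include <memory>

#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/shape_util.h"

std::unique_ptr<xla::Literal> MakeGaussianLiteral() {
  const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {2, 3});

  // Seeds a std::minstd_rand0 internally and fills the literal with draws
  // from a normal distribution N(mean, stddev).
  auto literal_or = xla::Literal::CreateRandomLiteral<xla::F32>(
      shape, /*mean=*/0.0f, /*stddev=*/1.0f);
  if (!literal_or.ok()) {
    return nullptr;  // In real code, propagate literal_or.status() instead.
  }
  return literal_or.ConsumeValueOrDie();
}
```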
@@ -1373,36 +1374,36 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) { ASSERT_EQ(h1, r[3]); } -TEST_F(LiteralUtilTest, LiteralViewTest) { +TEST_F(LiteralUtilTest, LiteralSliceTest) { auto scalar = Literal::CreateR0(1.0); auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()}); Literal nil(ShapeUtil::MakeNil()); - EXPECT_EQ(LiteralView::Create(*scalar, {}), *scalar); - EXPECT_EQ(LiteralView::Create(*matrix, {}), *matrix); - EXPECT_EQ(LiteralView::Create(*tuple, {}), *tuple); - EXPECT_EQ(LiteralView::Create(*nested_tuple, {}), *nested_tuple); - EXPECT_EQ(LiteralView::Create(nil, {}), nil); + EXPECT_EQ(LiteralSlice(*scalar, {}), *scalar); + EXPECT_EQ(LiteralSlice(*matrix, {}), *matrix); + EXPECT_EQ(LiteralSlice(*tuple, {}), *tuple); + EXPECT_EQ(LiteralSlice(*nested_tuple, {}), *nested_tuple); + EXPECT_EQ(LiteralSlice(nil, {}), nil); - EXPECT_EQ(LiteralView::Create(*tuple, {0}), *scalar); - EXPECT_EQ(LiteralView::Create(*tuple, {1}), *matrix); + EXPECT_EQ(LiteralSlice(*tuple, {0}), *scalar); + EXPECT_EQ(LiteralSlice(*tuple, {1}), *matrix); - EXPECT_EQ(LiteralView::Create(*nested_tuple, {0}), *tuple); - EXPECT_EQ(LiteralView::Create(*nested_tuple, {0, 0}), *scalar); - EXPECT_EQ(LiteralView::Create(*nested_tuple, {0, 1}), *matrix); - EXPECT_EQ(LiteralView::Create(*nested_tuple, {1}), *scalar); + EXPECT_EQ(LiteralSlice(*nested_tuple, {0}), *tuple); + EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 0}), *scalar); + EXPECT_EQ(LiteralSlice(*nested_tuple, {0, 1}), *matrix); + EXPECT_EQ(LiteralSlice(*nested_tuple, {1}), *scalar); } -TEST_F(LiteralUtilTest, MutatingLiteralView) { +TEST_F(LiteralUtilTest, MutatingLiteralSlice) { auto scalar = Literal::CreateR0(1.0); auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()}); // Verify that changing the underlying data beneath the view changes the // data of the view itself. 
- const auto nested_tuple_view = LiteralView::Create(*nested_tuple); + const auto nested_tuple_view = LiteralSlice(*nested_tuple); EXPECT_EQ( nested_tuple->Get(/*multi_index=*/{}, /*shape_index=*/{0, 0}), 1.0f); @@ -1418,19 +1419,57 @@ TEST_F(LiteralUtilTest, MutatingLiteralView) { 555.0f); } -TEST_F(LiteralUtilTest, LiteralViewOfALiteralView) { +TEST_F(LiteralUtilTest, LiteralSliceOfALiteralSlice) { auto scalar = Literal::CreateR0(1.0); auto matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); auto tuple = Literal::MakeTuple({scalar.get(), matrix.get()}); auto nested_tuple = Literal::MakeTuple({tuple.get(), scalar.get()}); - const auto nested_tuple_view = LiteralView::Create(*nested_tuple); - const auto tuple_view = - LiteralView::Create(nested_tuple_view, /*view_root=*/{0}); - const auto matrix_view = LiteralView::Create(tuple_view, /*view_root=*/{1}); + const auto nested_tuple_view = LiteralSlice(*nested_tuple); + const auto tuple_view = LiteralSlice(nested_tuple_view, /*view_root=*/{0}); + const auto matrix_view = LiteralSlice(tuple_view, /*view_root=*/{1}); EXPECT_EQ(matrix_view, *Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}})); } +TEST_F(LiteralUtilTest, BorrowingLiteralFromOneBufferPtrTest) { + std::vector int64_values = {1, 2, 3}; + const Shape literal_shape = ShapeUtil::MakeShape(S64, {3}); + + BorrowingLiteral literal(reinterpret_cast(int64_values.data()), + literal_shape); + + EXPECT_EQ(literal.Get({0}), 1); + EXPECT_EQ(literal.Get({1}), 2); + EXPECT_EQ(literal.Get({2}), 3); +} + +TEST_F(LiteralUtilTest, BorrowingLiteralFromMultipleBufferPtrsTest) { + std::vector one_two_three = {1, 2, 3}; + const Shape one_two_three_shape = ShapeUtil::MakeShape(S64, {3}); + + std::vector hundred = {100}; + const Shape hundred_shape = ShapeUtil::MakeShape(S64, {1}); + + std::vector src_buf_ptrs; + src_buf_ptrs.emplace_back( + reinterpret_cast(one_two_three.data())); + src_buf_ptrs.emplace_back(reinterpret_cast(hundred.data())); + auto literal_tuple = BorrowingLiteral( + src_buf_ptrs, + ShapeUtil::MakeTupleShape({one_two_three_shape, hundred_shape})); + + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{0}, /*shape_index=*/{0}), + 1); + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{0}, /*shape_index=*/{1}), + 100); + + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{1}, /*shape_index=*/{0}), + 2); + + EXPECT_EQ(literal_tuple.Get(/*multi_index=*/{2}, /*shape_index=*/{0}), + 3); +} + TEST_F(LiteralUtilTest, LiteralMove) { std::unique_ptr matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); @@ -1533,11 +1572,11 @@ TEST_F(LiteralUtilTest, LiteralMoveAssignment) { EXPECT_EQ(literal.Get({1, 1}), 4.0); } -TEST_F(LiteralUtilTest, LiteralViewCopy) { +TEST_F(LiteralUtilTest, LiteralSliceCopy) { std::unique_ptr matrix = Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); - const auto matrix_view = LiteralView::Create(*matrix); - LiteralView matrix_view_copy(matrix_view); + const auto matrix_view = LiteralSlice(*matrix); + LiteralSlice matrix_view_copy(matrix_view); EXPECT_EQ(matrix_view_copy.Get({0, 0}), 1.0); EXPECT_EQ(matrix_view_copy.Get({0, 1}), 2.0); @@ -1771,5 +1810,35 @@ TEST_F(LiteralUtilTest, GetSparseElementAsString) { tensorflow::strings::StrCat("(", float{3.0}, ", ", float{4.0}, ")")); } +TEST_F(LiteralUtilTest, BroadcastVectorToMatrix0) { + std::unique_ptr literal = Literal::CreateR1({1, 2}); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr broadcasted_literal, + literal->Broadcast( + /*result_shape=*/ShapeUtil::MakeShape(S64, {2, 2}), + /*dimensions=*/{0})); + EXPECT_EQ(*broadcasted_literal, 
*Literal::CreateR2({{1, 1}, {2, 2}})); +} + +TEST_F(LiteralUtilTest, BroadcastVectorToMatrix1) { + std::unique_ptr literal = Literal::CreateR1({1, 2}); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr broadcasted_literal, + literal->Broadcast( + /*result_shape=*/ShapeUtil::MakeShape(S64, {2, 2}), + /*dimensions=*/{1})); + EXPECT_EQ(*broadcasted_literal, *Literal::CreateR2({{1, 2}, {1, 2}})); +} + +TEST_F(LiteralUtilTest, BroadcastScalarToMatrix) { + std::unique_ptr literal = Literal::CreateR0(9); + TF_ASSERT_OK_AND_ASSIGN( + std::unique_ptr broadcasted_literal, + literal->Broadcast( + /*result_shape=*/ShapeUtil::MakeShape(S32, {2, 2}), + /*dimensions=*/{})); + EXPECT_EQ(*broadcasted_literal, *Literal::CreateR2({{9, 9}, {9, 9}})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/map_util.h b/tensorflow/compiler/xla/map_util.h index 8db8c6f3de84a6..3c74e070da529b 100644 --- a/tensorflow/compiler/xla/map_util.h +++ b/tensorflow/compiler/xla/map_util.h @@ -86,11 +86,10 @@ const typename Collection::value_type::second_type& FindOrDefault( // Inserts the key-value pair into the collection. Dies if key was already // present. -template -void InsertOrDie(Collection* const collection, - const typename Collection::value_type::first_type& key, - const typename Collection::value_type::second_type& data) { - auto p = collection->insert(std::make_pair(key, data)); +template +void InsertOrDie(Collection* const collection, Key&& key, Value&& value) { + auto p = collection->insert( + std::make_pair(std::forward(key), std::forward(value))); CHECK(p.second) << "duplicate key: " << key; } @@ -101,9 +100,10 @@ bool ContainsKey(const Collection& collection, const Key& key) { } // Inserts `value` into `set`. Dies if it was already present. -template -void InsertOrDie(Set* const set, const typename Set::value_type& value) { - CHECK(set->insert(value).second) << "duplicate value: " << value; +template +void InsertOrDie(Set* const set, Value&& value) { + CHECK(set->insert(std::forward(value)).second) + << "duplicate value: " << value; } } // namespace xla diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD index ecb87bd8893276..83834c1ff65ea2 100644 --- a/tensorflow/compiler/xla/python/BUILD +++ b/tensorflow/compiler/xla/python/BUILD @@ -12,6 +12,7 @@ py_library( deps = [ ":pywrap_xla", "//tensorflow/compiler/xla:xla_data_proto_py", + "//tensorflow/compiler/xla/service:hlo_proto_py", ], ) @@ -49,9 +50,11 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:executable_build_options", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:shaped_buffer", "//tensorflow/core:framework_lite", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc index 24e17abbe06197..f808990cadeab5 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.cc +++ b/tensorflow/compiler/xla/python/local_computation_builder.cc @@ -15,8 +15,9 @@ limitations under the License. 
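Editor's note: the `map_util.h` hunk above generalizes both `InsertOrDie` overloads from const-reference parameters to perfect forwarding, which lets move-only keys and values (for example `std::unique_ptr`) be inserted without a copy. Below is a self-contained sketch of the same pattern using only standard containers; the function here is a stand-in for illustration, not the file's actual code.

```c++
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <utility>

// Stand-in for the forwarding InsertOrDie in map_util.h: key and value are
// forwarded into the container, so rvalue arguments are moved rather than
// copied, and the insertion must succeed.
template <typename Collection, typename Key, typename Value>
void InsertOrDie(Collection* collection, Key&& key, Value&& value) {
  const bool inserted =
      collection
          ->insert(std::make_pair(std::forward<Key>(key),
                                  std::forward<Value>(value)))
          .second;
  assert(inserted && "duplicate key");
  (void)inserted;
}

int main() {
  std::map<std::string, std::unique_ptr<int>> m;
  // With the previous const& signature the unique_ptr value would have to be
  // copied, which does not compile; forwarding moves it into the map instead.
  InsertOrDie(&m, std::string("answer"), std::make_unique<int>(42));
  assert(*m.at("answer") == 42);
  return 0;
}
```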
#include "tensorflow/compiler/xla/python/local_computation_builder.h" #include "tensorflow/compiler/xla/executable_run_options.h" +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/platform/default/thread_annotations.h" +#include "tensorflow/core/platform/thread_annotations.h" namespace xla { @@ -104,25 +105,25 @@ static StatusOr ToBuffer(LocalClient* client, } /* static */ -LocalShapedBuffer* LocalShapedBuffer::FromLiteral( +StatusOr LocalShapedBuffer::FromLiteral( const Literal& argument, const tensorflow::gtl::optional& shape_with_layout) { LocalClient* client = GetOrCreateLocalClient(); - ScopedShapedBuffer buf = [&] { + StatusOr buf = [&] { if (shape_with_layout) { std::unique_ptr relaid = argument.Relayout(shape_with_layout.value()); - return ToBuffer(client, /*device_ordinal=*/0, *relaid) - .ConsumeValueOrDie(); + return ToBuffer(client, /*device_ordinal=*/0, *relaid); } - return ToBuffer(client, /*device_ordinal=*/0, argument).ConsumeValueOrDie(); + return ToBuffer(client, /*device_ordinal=*/0, argument); }(); - return new LocalShapedBuffer(std::move(buf)); + TF_RETURN_IF_ERROR(buf.status()); + return new LocalShapedBuffer(std::move(buf).ValueOrDie()); } -std::unique_ptr LocalShapedBuffer::ToLiteral() const { +StatusOr> LocalShapedBuffer::ToLiteral() const { LocalClient* client = GetOrCreateLocalClient(); - return client->ShapedBufferToLiteral(*shaped_buffer()).ConsumeValueOrDie(); + return client->ShapedBufferToLiteral(*shaped_buffer()); } CompiledLocalComputation::CompiledLocalComputation( @@ -197,8 +198,6 @@ StatusOr> CompiledLocalComputation::Execute( ExecutableRunOptions options; options.set_device_ordinal(device_ordinal); options.set_allocator(client->backend().memory_allocator()); - options.set_inter_op_thread_pool( - client->backend().inter_op_thread_pool()); options.set_intra_op_thread_pool( client->backend().eigen_intra_op_thread_pool_device()); options.set_device_assignment(&device_assignment); @@ -242,7 +241,6 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers( // Execute ExecutableRunOptions options; options.set_allocator(client->backend().memory_allocator()); - options.set_inter_op_thread_pool(client->backend().inter_op_thread_pool()); options.set_intra_op_thread_pool( client->backend().eigen_intra_op_thread_pool_device()); ScopedShapedBuffer result_buffer = @@ -251,7 +249,7 @@ LocalShapedBuffer* CompiledLocalComputation::ExecuteWithShapedBuffers( return new LocalShapedBuffer(std::move(result_buffer)); } -LocalComputation::LocalComputation(Computation computation) +LocalComputation::LocalComputation(XlaComputation computation) : computation_(std::move(computation)) {} StatusOr LocalComputation::Compile( @@ -274,18 +272,31 @@ StatusOr LocalComputation::Compile( return new CompiledLocalComputation(std::move(local_executable)); } -const Computation& LocalComputation::computation() const { +const XlaComputation& LocalComputation::computation() const { return computation_; } +string LocalComputation::GetSerializedProto() const { + string result; + if (!computation_.proto().SerializeToString(&result)) { + LOG(ERROR) << "Failed to serialize the HloModuleProto."; + return ""; + } + return result; +} + StatusOr LocalComputation::GetReturnValueShape() const { TF_ASSIGN_OR_RETURN(ProgramShape program_shape, computation_.GetProgramShape()); return std::move(*program_shape.mutable_result()); } +LocalOp::LocalOp(const XlaOp& op) : op_(op) {} + +const XlaOp& LocalOp::op() const { return op_; } + 
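Editor's note: in the hunks above, `LocalShapedBuffer::FromLiteral` and `ToLiteral` stop calling `ConsumeValueOrDie` and instead return `StatusOr`, so a failed host/device transfer is reported to the caller (and ultimately surfaced to Python through the `StatusOr` typemaps added later in this patch) rather than crashing the process. A hedged sketch of how a C++ caller consumes the new signatures follows; the wrapper function and its error handling are illustrative only and rely on the declarations in `local_computation_builder.h` as modified here.

```c++
#include <memory>

#include "tensorflow/compiler/xla/python/local_computation_builder.h"

xla::Status RoundTripThroughDevice(const xla::Literal& argument) {
  using xla::swig::LocalShapedBuffer;

  // FromLiteral now propagates transfer errors instead of CHECK-failing.
  auto buffer_or = LocalShapedBuffer::FromLiteral(
      argument, /*shape_with_layout=*/tensorflow::gtl::optional<xla::Shape>());
  if (!buffer_or.ok()) {
    return buffer_or.status();
  }
  std::unique_ptr<LocalShapedBuffer> buffer(buffer_or.ValueOrDie());

  // ToLiteral likewise returns a StatusOr instead of dying on failure.
  auto literal_or = buffer->ToLiteral();
  if (!literal_or.ok()) {
    return literal_or.status();
  }
  return xla::Status::OK();
}
```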
LocalComputationBuilder::LocalComputationBuilder(const string& computation_name) - : builder_(GetOrCreateLocalClient(), computation_name) {} + : builder_(computation_name) {} void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) { builder_.SetOpMetadata(metadata); @@ -294,19 +305,21 @@ void LocalComputationBuilder::SetOpMetadata(const OpMetadata& metadata) { void LocalComputationBuilder::ClearOpMetadata() { builder_.ClearOpMetadata(); } StatusOr LocalComputationBuilder::Build() { - TF_ASSIGN_OR_RETURN(Computation computation, builder_.Build()); + TF_ASSIGN_OR_RETURN(XlaComputation computation, builder_.Build()); return new LocalComputation(std::move(computation)); } -ComputationDataHandle LocalComputationBuilder::Parameter(int64 parameter_number, - const Shape& shape, - const string& name) { +LocalOp LocalComputationBuilder::Parameter(int64 parameter_number, + const Shape& shape, + const string& name) { return builder_.Parameter(parameter_number, shape, name); } std::unique_ptr LocalComputationBuilder::GetShape( - const ComputationDataHandle& operand) { - return builder_.GetShape(operand).ConsumeValueOrDie(); + const LocalOp& operand) { + auto result = MakeUnique(); + *result = builder_.GetShape(operand.op()).ValueOrDie(); + return result; } StatusOr LocalComputationBuilder::GetReturnValueShape() { @@ -314,222 +327,236 @@ StatusOr LocalComputationBuilder::GetReturnValueShape() { return program_shape.result(); } -ComputationDataHandle LocalComputationBuilder::Infeed(const Shape& shape) { +LocalOp LocalComputationBuilder::Infeed(const Shape& shape) { return builder_.Infeed(shape); } -void LocalComputationBuilder::Outfeed(const ComputationDataHandle& operand, +void LocalComputationBuilder::Outfeed(const LocalOp& operand, const Shape& shape, const string& outfeed_config) { - builder_.Outfeed(operand, shape, outfeed_config); + builder_.Outfeed(operand.op(), shape, outfeed_config); } -ComputationDataHandle LocalComputationBuilder::ConstantLiteral( - const Literal& literal) { +LocalOp LocalComputationBuilder::ConstantLiteral(const Literal& literal) { return builder_.ConstantLiteral(literal); } -ComputationDataHandle LocalComputationBuilder::Broadcast( - const ComputationDataHandle& operand, +LocalOp LocalComputationBuilder::Broadcast( + const LocalOp& operand, tensorflow::gtl::ArraySlice broadcast_sizes) { - return builder_.Broadcast(operand, broadcast_sizes); + return builder_.Broadcast(operand.op(), broadcast_sizes); } -ComputationDataHandle LocalComputationBuilder::Pad( - const ComputationDataHandle& operand, - const ComputationDataHandle& padding_value, - const PaddingConfig& padding_config) { - return builder_.Pad(operand, padding_value, padding_config); +LocalOp LocalComputationBuilder::Pad(const LocalOp& operand, + const LocalOp& padding_value, + const PaddingConfig& padding_config) { + return builder_.Pad(operand.op(), padding_value.op(), padding_config); } -ComputationDataHandle LocalComputationBuilder::Reshape( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions, +LocalOp LocalComputationBuilder::Reshape( + const LocalOp& operand, tensorflow::gtl::ArraySlice dimensions, tensorflow::gtl::ArraySlice new_sizes) { - return builder_.Reshape(operand, dimensions, new_sizes); + return builder_.Reshape(operand.op(), dimensions, new_sizes); } -ComputationDataHandle LocalComputationBuilder::Collapse( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions) { - return builder_.Collapse(operand, dimensions); +LocalOp 
LocalComputationBuilder::Collapse( + const LocalOp& operand, tensorflow::gtl::ArraySlice dimensions) { + return builder_.Collapse(operand.op(), dimensions); } -ComputationDataHandle LocalComputationBuilder::CrossReplicaSum( - const ComputationDataHandle& operand) { - return builder_.CrossReplicaSum(operand); +LocalOp LocalComputationBuilder::CrossReplicaSum(const LocalOp& operand) { + return builder_.CrossReplicaSum(operand.op()); } -ComputationDataHandle LocalComputationBuilder::Slice( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice start_indices, +LocalOp LocalComputationBuilder::Slice( + const LocalOp& operand, tensorflow::gtl::ArraySlice start_indices, tensorflow::gtl::ArraySlice limit_indices, tensorflow::gtl::ArraySlice strides) { - return builder_.Slice(operand, start_indices, limit_indices, strides); + return builder_.Slice(operand.op(), start_indices, limit_indices, strides); } -ComputationDataHandle LocalComputationBuilder::SliceInDim( - const ComputationDataHandle& operand, int64 start_index, int64 limit_index, - int64 stride, int64 dimno) { - return builder_.SliceInDim(operand, start_index, limit_index, stride, dimno); +LocalOp LocalComputationBuilder::SliceInDim(const LocalOp& operand, + int64 start_index, + int64 limit_index, int64 stride, + int64 dimno) { + return builder_.SliceInDim(operand.op(), start_index, limit_index, stride, + dimno); } -ComputationDataHandle LocalComputationBuilder::DynamicSlice( - const ComputationDataHandle& operand, - const ComputationDataHandle& start_indices, +LocalOp LocalComputationBuilder::DynamicSlice( + const LocalOp& operand, const LocalOp& start_indices, tensorflow::gtl::ArraySlice slice_sizes) { - return builder_.DynamicSlice(operand, start_indices, slice_sizes); + return builder_.DynamicSlice(operand.op(), start_indices.op(), slice_sizes); } -ComputationDataHandle LocalComputationBuilder::DynamicUpdateSlice( - const ComputationDataHandle& operand, const ComputationDataHandle& update, - const ComputationDataHandle& start_indices) { - return builder_.DynamicUpdateSlice(operand, update, start_indices); +LocalOp LocalComputationBuilder::DynamicUpdateSlice( + const LocalOp& operand, const LocalOp& update, + const LocalOp& start_indices) { + return builder_.DynamicUpdateSlice(operand.op(), update.op(), + start_indices.op()); } -ComputationDataHandle LocalComputationBuilder::ConcatInDim( - tensorflow::gtl::ArraySlice operands, - int64 dimension) { - return builder_.ConcatInDim(operands, dimension); +LocalOp LocalComputationBuilder::ConcatInDim( + tensorflow::gtl::ArraySlice operands, int64 dimension) { + std::vector xla_ops; + xla_ops.reserve(operands.size()); + for (const auto& op : operands) { + xla_ops.push_back(op.op()); + } + return builder_.ConcatInDim(xla_ops, dimension); } -ComputationDataHandle -LocalComputationBuilder::SelectAndScatterWithGeneralPadding( - const ComputationDataHandle& operand, const LocalComputation& select, +LocalOp LocalComputationBuilder::SelectAndScatterWithGeneralPadding( + const LocalOp& operand, const LocalComputation& select, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, tensorflow::gtl::ArraySlice> padding, - const ComputationDataHandle& source, - const ComputationDataHandle& init_value, const LocalComputation& scatter) { + const LocalOp& source, const LocalOp& init_value, + const LocalComputation& scatter) { return builder_.SelectAndScatterWithGeneralPadding( - operand, select.computation(), window_dimensions, window_strides, padding, - 
source, init_value, scatter.computation()); + operand.op(), select.computation(), window_dimensions, window_strides, + padding, source.op(), init_value.op(), scatter.computation()); } -ComputationDataHandle LocalComputationBuilder::Tuple( - tensorflow::gtl::ArraySlice elements) { - return builder_.Tuple(elements); +LocalOp LocalComputationBuilder::Tuple( + tensorflow::gtl::ArraySlice elements) { + std::vector xla_ops; + xla_ops.reserve(elements.size()); + for (const auto& op : elements) { + xla_ops.push_back(op.op()); + } + + return builder_.Tuple(xla_ops); } -ComputationDataHandle LocalComputationBuilder::GetTupleElement( - const ComputationDataHandle& tuple_data, int64 index) { - return builder_.GetTupleElement(tuple_data, index); +LocalOp LocalComputationBuilder::GetTupleElement(const LocalOp& tuple_data, + int64 index) { + return builder_.GetTupleElement(tuple_data.op(), index); } -ComputationDataHandle LocalComputationBuilder::Dot( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs) { - return builder_.Dot(lhs, rhs); +LocalOp LocalComputationBuilder::Dot(const LocalOp& lhs, const LocalOp& rhs) { + return builder_.Dot(lhs.op(), rhs.op()); } -ComputationDataHandle LocalComputationBuilder::DotGeneral( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, +LocalOp LocalComputationBuilder::DotGeneral( + const LocalOp& lhs, const LocalOp& rhs, const DotDimensionNumbers& dimension_numbers) { - return builder_.DotGeneral(lhs, rhs, dimension_numbers); + return builder_.DotGeneral(lhs.op(), rhs.op(), dimension_numbers); } -ComputationDataHandle LocalComputationBuilder::ConvGeneralDilated( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, +LocalOp LocalComputationBuilder::ConvGeneralDilated( + const LocalOp& lhs, const LocalOp& rhs, tensorflow::gtl::ArraySlice window_strides, tensorflow::gtl::ArraySlice> padding, tensorflow::gtl::ArraySlice lhs_dilation, tensorflow::gtl::ArraySlice rhs_dilation, const ConvolutionDimensionNumbers& dimension_numbers) { - return builder_.ConvGeneralDilated(lhs, rhs, window_strides, padding, - lhs_dilation, rhs_dilation, + return builder_.ConvGeneralDilated(lhs.op(), rhs.op(), window_strides, + padding, lhs_dilation, rhs_dilation, dimension_numbers); } -ComputationDataHandle LocalComputationBuilder::ConvertElementType( - const ComputationDataHandle& operand, PrimitiveType new_element_type) { - return builder_.ConvertElementType(operand, new_element_type); +LocalOp LocalComputationBuilder::ConvertElementType( + const LocalOp& operand, PrimitiveType new_element_type) { + return builder_.ConvertElementType(operand.op(), new_element_type); } -ComputationDataHandle LocalComputationBuilder::Call( +LocalOp LocalComputationBuilder::Call( const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice operands) { - return builder_.Call(local_computation.computation(), operands); + tensorflow::gtl::ArraySlice operands) { + std::vector xla_ops; + xla_ops.reserve(operands.size()); + for (const auto& op : operands) { + xla_ops.push_back(op.op()); + } + return builder_.Call(local_computation.computation(), xla_ops); } -ComputationDataHandle LocalComputationBuilder::Transpose( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice permutation) { - return builder_.Transpose(operand, permutation); +LocalOp LocalComputationBuilder::Transpose( + const LocalOp& operand, tensorflow::gtl::ArraySlice permutation) { + return builder_.Transpose(operand.op(), permutation); } -ComputationDataHandle 
LocalComputationBuilder::Rev( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions) { - return builder_.Rev(operand, dimensions); +LocalOp LocalComputationBuilder::Rev( + const LocalOp& operand, tensorflow::gtl::ArraySlice dimensions) { + return builder_.Rev(operand.op(), dimensions); } -ComputationDataHandle LocalComputationBuilder::Map( - tensorflow::gtl::ArraySlice operands, +LocalOp LocalComputationBuilder::Map( + tensorflow::gtl::ArraySlice operands, const LocalComputation& local_computation, tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice static_operands) { - return builder_.Map(operands, local_computation.computation(), dimensions, - static_operands); + tensorflow::gtl::ArraySlice static_operands) { + std::vector xla_ops; + xla_ops.reserve(operands.size()); + for (const auto& op : operands) { + xla_ops.push_back(op.op()); + } + + std::vector static_xla_ops; + static_xla_ops.reserve(static_operands.size()); + for (const auto& op : static_operands) { + static_xla_ops.push_back(op.op()); + } + + return builder_.Map(xla_ops, local_computation.computation(), dimensions, + static_xla_ops); } -ComputationDataHandle LocalComputationBuilder::Reduce( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, +LocalOp LocalComputationBuilder::Reduce( + const LocalOp& operand, const LocalOp& init_value, const LocalComputation& local_computation, tensorflow::gtl::ArraySlice dimensions_to_reduce) { - return builder_.Reduce(operand, init_value, local_computation.computation(), - dimensions_to_reduce); + return builder_.Reduce(operand.op(), init_value.op(), + local_computation.computation(), dimensions_to_reduce); } -ComputationDataHandle LocalComputationBuilder::ReduceWindowWithGeneralPadding( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, +LocalOp LocalComputationBuilder::ReduceWindowWithGeneralPadding( + const LocalOp& operand, const LocalOp& init_value, const LocalComputation& local_computation, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, tensorflow::gtl::ArraySlice> padding) { return builder_.ReduceWindowWithGeneralPadding( - operand, init_value, local_computation.computation(), window_dimensions, - window_strides, padding); + operand.op(), init_value.op(), local_computation.computation(), + window_dimensions, window_strides, padding); } -ComputationDataHandle LocalComputationBuilder::RngNormal( - const ComputationDataHandle& mu, const ComputationDataHandle& sigma, - const Shape& shape) { - return builder_.RngNormal(mu, sigma, shape); +LocalOp LocalComputationBuilder::RngNormal(const LocalOp& mu, + const LocalOp& sigma, + const Shape& shape) { + return builder_.RngNormal(mu.op(), sigma.op(), shape); } -ComputationDataHandle LocalComputationBuilder::RngUniform( - const ComputationDataHandle& a, const ComputationDataHandle& b, - const Shape& shape) { - return builder_.RngUniform(a, b, shape); +LocalOp LocalComputationBuilder::RngUniform(const LocalOp& a, const LocalOp& b, + const Shape& shape) { + return builder_.RngUniform(a.op(), b.op(), shape); } -ComputationDataHandle LocalComputationBuilder::While( - const LocalComputation& condition, const LocalComputation& body, - const ComputationDataHandle& init) { - return builder_.While(condition.computation(), body.computation(), init); +LocalOp LocalComputationBuilder::While(const LocalComputation& condition, + const LocalComputation& body, + const LocalOp& init) { + return 
builder_.While(condition.computation(), body.computation(), init.op()); } -ComputationDataHandle LocalComputationBuilder::Conditional( - const ComputationDataHandle& predicate, - const ComputationDataHandle& true_operand, - const LocalComputation& true_computation, - const ComputationDataHandle& false_operand, +LocalOp LocalComputationBuilder::Conditional( + const LocalOp& predicate, const LocalOp& true_operand, + const LocalComputation& true_computation, const LocalOp& false_operand, const LocalComputation& false_computation) { - return builder_.Conditional(predicate, true_operand, - true_computation.computation(), false_operand, - false_computation.computation()); + return builder_.Conditional( + predicate.op(), true_operand.op(), true_computation.computation(), + false_operand.op(), false_computation.computation()); } -StatusOr LocalComputationBuilder::IsConstant( - const ComputationDataHandle& operand, int64 num_parameters) { - return builder_.IsConstant(operand, num_parameters); +StatusOr LocalComputationBuilder::IsConstant(const LocalOp& operand) { + return builder_.IsConstant(operand.op()); } -StatusOr> LocalComputationBuilder::ComputeConstant( - const ComputationDataHandle& operand, const Layout* output_layout, - tensorflow::gtl::ArraySlice parameters) { - return builder_.ComputeConstant(operand, output_layout, parameters); +StatusOr LocalComputationBuilder::BuildConstantSubGraph( + const LocalOp& operand) { + TF_ASSIGN_OR_RETURN(XlaComputation computation, + builder_.BuildConstantSubGraph(operand.op())); + return new LocalComputation(std::move(computation)); } #define _FORWARD(method_name, return_sig, args_sig, args) \ @@ -537,23 +564,19 @@ StatusOr> LocalComputationBuilder::ComputeConstant( return builder_.method_name args; \ } -#define _FORWARD_UNOP(method_name) \ - _FORWARD(method_name, ComputationDataHandle, \ - (const ComputationDataHandle& operand), (operand)) - -#define _FORWARD_BINOP(method_name) \ - _FORWARD( \ - method_name, ComputationDataHandle, \ - (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ - tensorflow::gtl::ArraySlice broadcast_dimensions), \ - (lhs, rhs, broadcast_dimensions)) - -#define _FORWARD_TRIOP(method_name) \ - _FORWARD( \ - method_name, ComputationDataHandle, \ - (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ - const ComputationDataHandle& ehs), \ - (lhs, rhs, ehs)) +#define _FORWARD_UNOP(method_name) \ + _FORWARD(method_name, LocalOp, (const LocalOp& operand), (operand.op())) + +#define _FORWARD_BINOP(method_name) \ + _FORWARD(method_name, LocalOp, \ + (const LocalOp& lhs, const LocalOp& rhs, \ + tensorflow::gtl::ArraySlice broadcast_dimensions), \ + (lhs.op(), rhs.op(), broadcast_dimensions)) + +#define _FORWARD_TRIOP(method_name) \ + _FORWARD(method_name, LocalOp, \ + (const LocalOp& lhs, const LocalOp& rhs, const LocalOp& ehs), \ + (lhs.op(), rhs.op(), ehs.op())) _FORWARD_TRIOP(Select) _FORWARD_TRIOP(Clamp) diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h index e1048909ab29c2..9ac13b65231c93 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.h +++ b/tensorflow/compiler/xla/python/local_computation_builder.h @@ -17,9 +17,10 @@ limitations under the License. 
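Editor's note: the rewritten `_FORWARD_*` macros above generate one forwarding method per listed op; each generated method now takes `LocalOp` arguments, unwraps them with `.op()`, and forwards to the underlying `XlaBuilder`. For readability, here is an approximate expansion of `_FORWARD_TRIOP(Select)` under the new macros (whitespace and parameter names follow the macro text; the expansion is illustrative, not code added by the patch).

```c++
// Approximate expansion of _FORWARD_TRIOP(Select) in
// local_computation_builder.cc with the LocalOp-based macros.
LocalOp LocalComputationBuilder::Select(const LocalOp& lhs, const LocalOp& rhs,
                                        const LocalOp& ehs) {
  return builder_.Select(lhs.op(), rhs.op(), ehs.op());
}
```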
#define TENSORFLOW_COMPILER_XLA_PYTHON_LOCAL_COMPUTATION_BUILDER_H_ #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/executable_build_options.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -59,12 +60,14 @@ StatusOr > TransferFromOutfeedLocalReplica( // client. class LocalShapedBuffer { public: - static LocalShapedBuffer* FromLiteral( + static StatusOr FromLiteral( const Literal& argument, const tensorflow::gtl::optional& shape_with_layout); + LocalShapedBuffer(ScopedShapedBuffer shaped_buffer); const ScopedShapedBuffer* shaped_buffer() const; - std::unique_ptr ToLiteral() const; + + StatusOr > ToLiteral() const; private: ScopedShapedBuffer shaped_buffer_; @@ -95,25 +98,42 @@ class CompiledLocalComputation { std::unique_ptr executable_; }; -// Wraps a Computation produced by a LocalComputationBuilder. The +// Wraps a XlaComputation produced by a LocalComputationBuilder. The // Compile method compiles the computation to a (local) executable via // the client library's local client. This class is intended to be // made available to Python via SWIG. class LocalComputation { public: - LocalComputation(Computation computation); + LocalComputation(XlaComputation computation); StatusOr Compile( const std::vector& argument_shapes, const ExecutableBuildOptions* build_options); - const Computation& computation() const; + const XlaComputation& computation() const; + + // Returns the HloModuleProto contained in the XlaComputation in the + // serialized binary format. Logs an internal error and returns an empty + // string on failure. + string GetSerializedProto() const; // Returns the return-value shape for this computation. StatusOr GetReturnValueShape() const; private: - Computation computation_; + XlaComputation computation_; +}; + +// Wraps a XlaOp produced by a LocalComputationBuilder. This class is intended +// to be made available to Python via SWIG. +class LocalOp { + public: + LocalOp(const XlaOp& op); + + const XlaOp& op() const; + + private: + XlaOp op_; }; // Wraps the ComputationBuilder API in order to: @@ -133,166 +153,137 @@ class LocalComputationBuilder { // Returns an owned LocalComputation to the caller on success. StatusOr Build(); - ComputationDataHandle Parameter(int64 parameter_number, const Shape& shape, - const string& name); + LocalOp Parameter(int64 parameter_number, const Shape& shape, + const string& name); - std::unique_ptr GetShape(const ComputationDataHandle& operand); + std::unique_ptr GetShape(const LocalOp& operand); // Returns the shape of the current return value for the computation. 
StatusOr GetReturnValueShape(); - ComputationDataHandle Infeed(const Shape& shape); + LocalOp Infeed(const Shape& shape); - void Outfeed(const ComputationDataHandle& operand, const Shape& shape, + void Outfeed(const LocalOp& operand, const Shape& shape, const string& outfeed_config); - ComputationDataHandle ConstantLiteral(const Literal& literal); + LocalOp ConstantLiteral(const Literal& literal); - ComputationDataHandle Broadcast( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice broadcast_sizes); + LocalOp Broadcast(const LocalOp& operand, + tensorflow::gtl::ArraySlice broadcast_sizes); - ComputationDataHandle Pad(const ComputationDataHandle& operand, - const ComputationDataHandle& padding_value, - const PaddingConfig& padding_config); + LocalOp Pad(const LocalOp& operand, const LocalOp& padding_value, + const PaddingConfig& padding_config); - ComputationDataHandle Reshape(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice new_sizes); + LocalOp Reshape(const LocalOp& operand, + tensorflow::gtl::ArraySlice dimensions, + tensorflow::gtl::ArraySlice new_sizes); - ComputationDataHandle Collapse(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions); + LocalOp Collapse(const LocalOp& operand, + tensorflow::gtl::ArraySlice dimensions); - ComputationDataHandle CrossReplicaSum(const ComputationDataHandle& operand); + LocalOp CrossReplicaSum(const LocalOp& operand); - ComputationDataHandle Slice(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice start_indices, - tensorflow::gtl::ArraySlice limit_indices, - tensorflow::gtl::ArraySlice strides); + LocalOp Slice(const LocalOp& operand, + tensorflow::gtl::ArraySlice start_indices, + tensorflow::gtl::ArraySlice limit_indices, + tensorflow::gtl::ArraySlice strides); - ComputationDataHandle SliceInDim(const ComputationDataHandle& operand, - int64 start_index, int64 limit_index, - int64 stride, int64 dimno); + LocalOp SliceInDim(const LocalOp& operand, int64 start_index, + int64 limit_index, int64 stride, int64 dimno); - ComputationDataHandle DynamicSlice( - const ComputationDataHandle& operand, - const ComputationDataHandle& start_indices, - tensorflow::gtl::ArraySlice slice_sizes); + LocalOp DynamicSlice(const LocalOp& operand, const LocalOp& start_indices, + tensorflow::gtl::ArraySlice slice_sizes); - ComputationDataHandle DynamicUpdateSlice( - const ComputationDataHandle& operand, const ComputationDataHandle& update, - const ComputationDataHandle& start_indices); + LocalOp DynamicUpdateSlice(const LocalOp& operand, const LocalOp& update, + const LocalOp& start_indices); - ComputationDataHandle ConcatInDim( - tensorflow::gtl::ArraySlice operands, - int64 dimension); + LocalOp ConcatInDim(tensorflow::gtl::ArraySlice operands, + int64 dimension); - ComputationDataHandle SelectAndScatterWithGeneralPadding( - const ComputationDataHandle& operand, const LocalComputation& select, + LocalOp SelectAndScatterWithGeneralPadding( + const LocalOp& operand, const LocalComputation& select, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, tensorflow::gtl::ArraySlice > padding, - const ComputationDataHandle& source, - const ComputationDataHandle& init_value, const LocalComputation& scatter); + const LocalOp& source, const LocalOp& init_value, + const LocalComputation& scatter); - ComputationDataHandle Tuple( - tensorflow::gtl::ArraySlice elements); + LocalOp Tuple(tensorflow::gtl::ArraySlice elements); - 
ComputationDataHandle GetTupleElement(const ComputationDataHandle& tuple_data, - int64 index); + LocalOp GetTupleElement(const LocalOp& tuple_data, int64 index); - ComputationDataHandle Dot(const ComputationDataHandle& lhs, - const ComputationDataHandle& rhs); + LocalOp Dot(const LocalOp& lhs, const LocalOp& rhs); - ComputationDataHandle DotGeneral( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, - const DotDimensionNumbers& dimension_numbers); + LocalOp DotGeneral(const LocalOp& lhs, const LocalOp& rhs, + const DotDimensionNumbers& dimension_numbers); - ComputationDataHandle ConvGeneralDilated( - const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, + LocalOp ConvGeneralDilated( + const LocalOp& lhs, const LocalOp& rhs, tensorflow::gtl::ArraySlice window_strides, tensorflow::gtl::ArraySlice > padding, tensorflow::gtl::ArraySlice lhs_dilation, tensorflow::gtl::ArraySlice rhs_dilation, const ConvolutionDimensionNumbers& dimension_numbers); - ComputationDataHandle ConvertElementType(const ComputationDataHandle& operand, - PrimitiveType new_element_type); + LocalOp ConvertElementType(const LocalOp& operand, + PrimitiveType new_element_type); - ComputationDataHandle Call( - const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice operands); + LocalOp Call(const LocalComputation& local_computation, + tensorflow::gtl::ArraySlice operands); - ComputationDataHandle Transpose( - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice permutation); + LocalOp Transpose(const LocalOp& operand, + tensorflow::gtl::ArraySlice permutation); - ComputationDataHandle Rev(const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice dimensions); + LocalOp Rev(const LocalOp& operand, + tensorflow::gtl::ArraySlice dimensions); - ComputationDataHandle Map( - tensorflow::gtl::ArraySlice operands, - const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice dimensions, - tensorflow::gtl::ArraySlice static_operands); + LocalOp Map(tensorflow::gtl::ArraySlice operands, + const LocalComputation& local_computation, + tensorflow::gtl::ArraySlice dimensions, + tensorflow::gtl::ArraySlice static_operands); - ComputationDataHandle Reduce( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, - const LocalComputation& local_computation, - tensorflow::gtl::ArraySlice dimensions_to_reduce); + LocalOp Reduce(const LocalOp& operand, const LocalOp& init_value, + const LocalComputation& local_computation, + tensorflow::gtl::ArraySlice dimensions_to_reduce); - ComputationDataHandle ReduceWindowWithGeneralPadding( - const ComputationDataHandle& operand, - const ComputationDataHandle& init_value, + LocalOp ReduceWindowWithGeneralPadding( + const LocalOp& operand, const LocalOp& init_value, const LocalComputation& local_computation, tensorflow::gtl::ArraySlice window_dimensions, tensorflow::gtl::ArraySlice window_strides, tensorflow::gtl::ArraySlice > padding); - ComputationDataHandle RngNormal(const ComputationDataHandle& mu, - const ComputationDataHandle& sigma, - const Shape& shape); + LocalOp RngNormal(const LocalOp& mu, const LocalOp& sigma, + const Shape& shape); - ComputationDataHandle RngUniform(const ComputationDataHandle& a, - const ComputationDataHandle& b, - const Shape& shape); + LocalOp RngUniform(const LocalOp& a, const LocalOp& b, const Shape& shape); - ComputationDataHandle While(const LocalComputation& condition, - const LocalComputation& body, - const ComputationDataHandle& init); + LocalOp While(const 
LocalComputation& condition, const LocalComputation& body, + const LocalOp& init); - ComputationDataHandle Conditional(const ComputationDataHandle& predicate, - const ComputationDataHandle& true_operand, - const LocalComputation& true_computation, - const ComputationDataHandle& false_operand, - const LocalComputation& false_computation); + LocalOp Conditional(const LocalOp& predicate, const LocalOp& true_operand, + const LocalComputation& true_computation, + const LocalOp& false_operand, + const LocalComputation& false_computation); - StatusOr IsConstant(const ComputationDataHandle& operand, - int64 num_parameters); + StatusOr IsConstant(const LocalOp& operand); - StatusOr > ComputeConstant( - const ComputationDataHandle& operand, const Layout* output_layout, - tensorflow::gtl::ArraySlice parameters); + StatusOr BuildConstantSubGraph(const LocalOp& operand); #define _FORWARD(method_name, return_sig, args_sig) \ return_sig method_name args_sig; -#define _FORWARD_UNOP(method_name) \ - _FORWARD(method_name, ComputationDataHandle, \ - (const ComputationDataHandle& operand)) +#define _FORWARD_UNOP(method_name) \ + _FORWARD(method_name, LocalOp, (const LocalOp& operand)) -#define _FORWARD_BINOP(method_name) \ - _FORWARD( \ - method_name, ComputationDataHandle, \ - (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ - tensorflow::gtl::ArraySlice broadcast_dimensions)) +#define _FORWARD_BINOP(method_name) \ + _FORWARD(method_name, LocalOp, \ + (const LocalOp& lhs, const LocalOp& rhs, \ + tensorflow::gtl::ArraySlice broadcast_dimensions)) -#define _FORWARD_TRIOP(method_name) \ - _FORWARD( \ - method_name, ComputationDataHandle, \ - (const ComputationDataHandle& lhs, const ComputationDataHandle& rhs, \ - const ComputationDataHandle& ehs)) +#define _FORWARD_TRIOP(method_name) \ + _FORWARD(method_name, LocalOp, \ + (const LocalOp& lhs, const LocalOp& rhs, const LocalOp& ehs)) _FORWARD_TRIOP(Select) _FORWARD_TRIOP(Clamp) @@ -336,7 +327,7 @@ class LocalComputationBuilder { #undef _FORWARD_TRIOP private: - ComputationBuilder builder_; + XlaBuilder builder_; }; // Functions for freeing resources from the Python side. diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i index ac792e8189bda9..536b93c6f9381a 100644 --- a/tensorflow/compiler/xla/python/local_computation_builder.i +++ b/tensorflow/compiler/xla/python/local_computation_builder.i @@ -22,9 +22,8 @@ limitations under the License. // // C++ Python // -------------------------------------+--------------------------------------- -// ComputationDataHandle <-> int // ArraySlice <- sequence of int -// ArraySlice <- sequence of int +// ArraySlice <- sequence of LocalOp // Literal <-> (nested tuple of) numpy ndarray // std::vector <- sequence of (nested tuple of) ndarray // Shape -> pair holding (dtype, dimensions) @@ -91,12 +90,9 @@ limitations under the License. // One central reason for the Python-side indirection is that the // Python-side objects produced by the typemaps in this file are // further packaged up by xla_client before being passed on. For -// instance, xla_client wraps the long produced for a C++ -// ComputationDataHandle in a Python ComputationDataHandle proto, -// rather than exposing a raw long outside of the client. Similarly, -// the Python pair produced for a C++ Shape is further wrapped in a -// Python class (xla_client.Shape) so as not to expose the raw pair -// externally. 
+// instance, the Python pair produced for a C++ Shape is further +// wrapped in a Python class (xla_client.Shape) so as not to expose +// the raw pair externally. // // Other SWIG object wrappers (e.g. of LocalComputation) are further // wrapped by xla_client in order to set up a custom destructor that @@ -124,6 +120,7 @@ using namespace xla; using namespace xla::swig; namespace xla { + namespace swig { bool GetIntAttr(PyObject* o, const char* field, int64* result) { @@ -177,27 +174,25 @@ bool HandleStringAttribute(PyObject* o, tensorflow::ImportNumpy(); %} -// ComputationDataHandle - -%typemap(in) const ComputationDataHandle& (ComputationDataHandle temp) { - const int64 handle = numpy::PyIntOrPyLongToLong($input); - if (handle == -1 && PyErr_Occurred()) { +%typemap(out) StatusOr { + if ($1.ok()) { + auto* value = $1.ValueOrDie(); + { + auto* $1 = value; + $typemap(out, xla::swig::CompiledLocalComputation*) + } + } else { + PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str()); SWIG_fail; } - temp.set_handle(handle); - $1 = &temp; } -%typemap(out) ComputationDataHandle { - $result = numpy::LongToPyIntOrPyLong($1.handle()); -} - -%typemap(out) StatusOr { +%typemap(out) StatusOr { if ($1.ok()) { auto* value = $1.ValueOrDie(); { auto* $1 = value; - $typemap(out, xla::swig::CompiledLocalComputation*) + $typemap(out, xla::swig::LocalShapedBuffer*) } } else { PyErr_SetString(PyExc_RuntimeError, $1.status().ToString().c_str()); @@ -288,33 +283,23 @@ tensorflow::ImportNumpy(); $1 = temps; } -// ComputationDataHandle +// ArraySlice -%typemap(in) tensorflow::gtl::ArraySlice - (std::vector temps) { +%typemap(in) tensorflow::gtl::ArraySlice( + std::vector temps) { if (!PySequence_Check($input)) { PyErr_SetString(PyExc_TypeError, "Argument is not a sequence"); SWIG_fail; } const int size = PySequence_Size($input); - temps.resize(size); for (int i = 0; i < size; ++i) { PyObject* o = PySequence_GetItem($input, i); - PyObject* py_int = numpy::PyNumberToPyInt(o); - if (!py_int) { - PyErr_SetString( - PyExc_TypeError, - "Argument sequence element cannot be converted to int"); - SWIG_fail; - } - const int64 handle = numpy::PyIntOrPyLongToLong(py_int); - if (handle == -1 && PyErr_Occurred()) { - Py_DECREF(py_int); - Py_DECREF(o); + LocalOp* op; + if ((SWIG_ConvertPtr(o, (void**)&op, $descriptor(xla::swig::LocalOp*), + SWIG_POINTER_EXCEPTION)) == -1) { SWIG_fail; } - temps[i].set_handle(handle); - Py_DECREF(py_int); + temps.push_back(*op); Py_DECREF(o); } $1 = temps; @@ -866,6 +851,11 @@ tensorflow::ImportNumpy(); })) { return nullptr; } + if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) { + build_options.set_dump_unoptimized_hlo_proto_to(std::move(s)); + })) { + return nullptr; + } if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) { build_options.set_dump_per_pass_hlo_proto_to(std::move(s)); })) { @@ -921,6 +911,8 @@ tensorflow::ImportNumpy(); %unignore xla::swig::LocalComputation; %unignore xla::swig::LocalComputation::Compile; %unignore xla::swig::LocalComputation::GetReturnValueShape; +%unignore xla::swig::LocalComputation::GetSerializedProto; +%unignore xla::swig::LocalOp; %unignore xla::swig::LocalComputationBuilder; %unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder; %unignore xla::swig::LocalComputationBuilder::Build; diff --git a/tensorflow/compiler/xla/python/numpy_bridge.cc b/tensorflow/compiler/xla/python/numpy_bridge.cc index dc6f5fe5fcc067..68648a3a176363 100644 --- 
a/tensorflow/compiler/xla/python/numpy_bridge.cc +++ b/tensorflow/compiler/xla/python/numpy_bridge.cc @@ -340,13 +340,13 @@ StatusOr OpMetadataFromPyObject(PyObject* o) { return result; } -PyObject* PyObjectFromXlaLiteral(const Literal& literal) { +PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal) { if (ShapeUtil::IsTuple(literal.shape())) { int num_elements = ShapeUtil::TupleElementCount(literal.shape()); PyObject* tuple = PyTuple_New(num_elements); for (int i = 0; i < num_elements; i++) { - PyTuple_SET_ITEM( - tuple, i, PyObjectFromXlaLiteral(LiteralView::Create(literal, {i}))); + PyTuple_SET_ITEM(tuple, i, + PyObjectFromXlaLiteral(LiteralSlice(literal, {i}))); } return tuple; } else { @@ -431,7 +431,7 @@ Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array, return Status::OK(); } -void CopyLiteralToNumpyArray(int np_type, const Literal& literal, +void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal, PyArrayObject* py_array) { switch (np_type) { case NPY_BOOL: diff --git a/tensorflow/compiler/xla/python/numpy_bridge.h b/tensorflow/compiler/xla/python/numpy_bridge.h index 9656cb1c31c39d..64f0aae0f9790f 100644 --- a/tensorflow/compiler/xla/python/numpy_bridge.h +++ b/tensorflow/compiler/xla/python/numpy_bridge.h @@ -74,7 +74,7 @@ StatusOr OpMetadataFromPyObject(PyObject* o); // array data. // // The return value is a new reference. -PyObject* PyObjectFromXlaLiteral(const Literal& literal); +PyObject* PyObjectFromXlaLiteral(const LiteralSlice& literal); // Converts a Numpy ndarray or a nested Python tuple thereof to a // corresponding XLA literal. @@ -90,7 +90,7 @@ StatusOr > XlaLiteralFromPyObject(PyObject* o); Status CopyNumpyArrayToLiteral(int np_type, PyArrayObject* py_array, Literal* literal); -void CopyLiteralToNumpyArray(int np_type, const Literal& literal, +void CopyLiteralToNumpyArray(int np_type, const LiteralSlice& literal, PyArrayObject* py_array); template @@ -101,7 +101,8 @@ void CopyNumpyArrayToLiteral(PyArrayObject* py_array, Literal* literal) { } template -void CopyLiteralToNumpyArray(const Literal& literal, PyArrayObject* py_array) { +void CopyLiteralToNumpyArray(const LiteralSlice& literal, + PyArrayObject* py_array) { NativeT* dest = static_cast(PyArray_DATA(py_array)); auto source = literal.data(); std::copy(source.begin(), source.end(), dest); diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py index f6809b6b871d7e..11611ac61287da 100644 --- a/tensorflow/compiler/xla/python/xla_client.py +++ b/tensorflow/compiler/xla/python/xla_client.py @@ -28,6 +28,7 @@ from tensorflow.compiler.xla import xla_data_pb2 from tensorflow.compiler.xla.python import pywrap_xla as c_api +from tensorflow.compiler.xla.service import hlo_pb2 # Most functions are snake_case for consistency with other modules, whereas @@ -335,20 +336,6 @@ def _wrap_shape(shape_info): return Shape.array_shape(dtype, dims) -def _wrap_data_handle(handle): - cdh = xla_data_pb2.ComputationDataHandle() - cdh.handle = handle - return cdh - - -def _unwrap_data_handle(handle_proto): - return handle_proto.handle - - -def _unwrap_data_handles(handle_protos): - return [_unwrap_data_handle(cdh) for cdh in handle_protos] - - def require_numpy_array_layout(value): if isinstance(value, tuple): return tuple(require_numpy_array_layout(x) for x in value) @@ -366,6 +353,7 @@ class CompileOptions(object): def __init__(self): self.generate_hlo_graph = None self.dump_optimized_hlo_proto_to = None + self.dump_unoptimized_hlo_proto_to = None 
self.dump_per_pass_hlo_proto_to = None self.hlo_profile = False @@ -424,6 +412,17 @@ def __init__(self, c_local_computation, is_compiled): assert isinstance(c_local_computation, c_api.LocalComputation) self._delete = c_api.DeleteLocalComputation + def GetProto(self): + """Get the HloModuleProto proto object in this local computation. + + Returns: + An HloModuleProto proto object that has the whole-graph information. + """ + + serialized = self.c_local_computation.GetSerializedProto() + proto = hlo_pb2.HloModuleProto.FromString(serialized) + return proto + def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None): """Compiles an un-compiled local computation. @@ -535,9 +534,9 @@ def Infeed(self, shape): queue for subsequent use in the computation. Returns: - A ComputationDataHandle message. + A LocalOp. """ - return _wrap_data_handle(self._client.Infeed(shape)) + return self._client.Infeed(shape) def Outfeed(self, operand): """Enqueues an outfeed op onto the computation. @@ -545,9 +544,7 @@ def Outfeed(self, operand): Outfeed operations enqueue data, using the given operand, onto the XLA outfeed queue for subsequent dequeue via the client API. """ - self._client.Outfeed( - _unwrap_data_handle(operand), self.GetShape(operand), - ''.encode('utf-8')) + self._client.Outfeed(operand, self.GetShape(operand), ''.encode('utf-8')) def Constant(self, value): """Enqueues a constant op onto the computation. @@ -557,10 +554,10 @@ def Constant(self, value): to one of the supported types. Returns: - A ComputationDataHandle message. + A LocalOp. """ value = require_numpy_array_layout(value) - return _wrap_data_handle(self._client.ConstantLiteral(value)) + return self._client.ConstantLiteral(value) def ConstantF32Scalar(self, value): """Convenience method to enqueue a scalar F32 constant op. @@ -569,7 +566,7 @@ def ConstantF32Scalar(self, value): value: a floating-point number. Returns: - A ComputationDataHandle message. + A LocalOp. """ return self.Constant(np.array(value, dtype=np.float32)) @@ -580,7 +577,7 @@ def ConstantF64Scalar(self, value): value: a floating-point number. Returns: - A ComputationDataHandle message. + A LocalOp. """ return self.Constant(np.array(value, dtype=np.float64)) @@ -591,7 +588,7 @@ def ConstantS32Scalar(self, value): value: a floating-point number. Returns: - A ComputationDataHandle message. + A LocalOp. """ return self.Constant(np.array(value, dtype=np.int32)) @@ -602,7 +599,7 @@ def ConstantS64Scalar(self, value): value: a floating-point number. Returns: - A ComputationDataHandle message. + A LocalOp. """ return self.Constant(np.array(value, dtype=np.int64)) @@ -613,7 +610,7 @@ def ConstantPredScalar(self, value): value: a boolean value. Returns: - A ComputationDataHandle message. + A LocalOp. """ return self.Constant(np.array(value, dtype=np.bool)) @@ -629,15 +626,14 @@ def ParameterWithShape(self, shape, name=None, parameter_num=None): parameters, use it for *all* parameters to avoid clashes. Returns: - A ComputationDataHandle message. + A LocalOp. """ if name is None: name = '' if parameter_num is None: parameter_num = next(self._parameter_numbering) - return _wrap_data_handle( - self._client.Parameter(parameter_num, shape, name.encode('utf8'))) + return self._client.Parameter(parameter_num, shape, name.encode('utf8')) def ParameterFromNumpy(self, value, name=None, parameter_num=None): """Enqueues a Parameter op onto the computation. 
@@ -649,7 +645,7 @@ def ParameterFromNumpy(self, value, name=None, parameter_num=None): parameter_num: as in ParameterWithShape. Returns: - A ComputationDataHandle message. + A LocalOp. """ return self.ParameterWithShape( Shape.from_pyval(value), name=name, parameter_num=parameter_num) @@ -658,14 +654,13 @@ def Broadcast(self, operand, sizes): """Enqueues a broadcast operation onto the computation. Args: - operand: the operand ComputationDataHandle to broadcast. + operand: the operand LocalOp to broadcast. sizes: an iterable of broadcast sizes. Returns: - A ComputationDataHandle representing the added broadcast op. + A LocalOp representing the added broadcast op. """ - return _wrap_data_handle( - self._client.Broadcast(_unwrap_data_handle(operand), sizes)) + return self._client.Broadcast(operand, sizes) def Concatenate(self, operands, dimension): """Enqueues a concatenate operation onto the computation. @@ -675,10 +670,9 @@ def Concatenate(self, operands, dimension): dimension: the dimension in which to perform the concatenation. Returns: - A ComputationDataHandle representing the added concatenate op. + A LocalOp representing the added concatenate op. """ - return _wrap_data_handle( - self._client.ConcatInDim(_unwrap_data_handles(operands), dimension)) + return self._client.ConcatInDim(operands, dimension) def ConvertElementType(self, operand, new_element_type): """Enqueues an element type conversion operation onto the computation. @@ -688,14 +682,12 @@ def ConvertElementType(self, operand, new_element_type): new_element_type: the target primitive type. Returns: - A ComputationDataHandle representing the added conversion op. + A LocalOp representing the added conversion op. """ - return _wrap_data_handle( - self._client.ConvertElementType( - _unwrap_data_handle(operand), new_element_type)) + return self._client.ConvertElementType(operand, new_element_type) def GetShape(self, operand): - return _wrap_shape(self._client.GetShape(_unwrap_data_handle(operand))) + return _wrap_shape(self._client.GetShape(operand)) def GetReturnValueShape(self): return _wrap_shape(self._client.GetReturnValueShape()) @@ -707,40 +699,35 @@ def Pad(self, operand, padding_value, padding_config): """Enqueues a Pad operation onto the computation. Args: - operand: ComputationDataHandle representing the array to pad. - padding_value: ComputationDataHandle representing the scalar pad value. + operand: LocalOp representing the array to pad. + padding_value: LocalOp representing the scalar pad value. padding_config: either an xla_data_pb2.PaddingConfig or a list of integer triples (edge_padding_low, edge_padding_high, interior_padding) representing the configuration of the padding operation. Returns: - A ComputationDataHandle representing the added Pad op. + A LocalOp representing the added Pad op. """ if not isinstance(padding_config, xla_data_pb2.PaddingConfig): padding_config = GetPaddingConfigFromTriples(padding_config) - return _wrap_data_handle( - self._client.Pad(_unwrap_data_handle(operand), - _unwrap_data_handle(padding_value), - padding_config)) + return self._client.Pad(operand, padding_value, padding_config) def Reshape(self, operand, dimensions, new_sizes): """Enqueues a reshape op onto the computation. Args: - operand: ComputationDataHandle representing the array to be reshaped. + operand: LocalOp representing the array to be reshaped. dimensions: sequence of integers encoding the order in which dimensions are collapsed or None, in which case dimensions are flattened in order. 
new_sizes: sequence of integers encoding the new dimension sizes (shape). Returns: - A ComputationDataHandle representing the added Reshape op. + A LocalOp representing the added Reshape op. """ if dimensions is None: ndim = len(self.GetShape(operand).dimensions()) dimensions = tuple(range(ndim)) - return _wrap_data_handle( - self._client.Reshape( - _unwrap_data_handle(operand), dimensions, new_sizes)) + return self._client.Reshape(operand, dimensions, new_sizes) def CrossReplicaSum(self, operand): """CrossReplicaSum op. @@ -749,67 +736,56 @@ def CrossReplicaSum(self, operand): operand: the operand to sum across replica instances. Returns: - A ComputationDataHandle that has the sum of the value among all replicas. + A LocalOp that has the sum of the value among all replicas. """ - return _wrap_data_handle( - self._client.CrossReplicaSum(_unwrap_data_handle(operand))) + return self._client.CrossReplicaSum(operand) def Collapse(self, operand, dimensions): """Collapse op.""" - return _wrap_data_handle( - self._client.Collapse(_unwrap_data_handle(operand), dimensions)) + return self._client.Collapse(operand, dimensions) def Trans(self, operand): """Specialized matrix transpose op.""" - return _wrap_data_handle( - self._client.Transpose(_unwrap_data_handle(operand), [1, 0])) + return self._client.Transpose(operand, [1, 0]) def Transpose(self, operand, permutation): """Transpose op.""" - return _wrap_data_handle( - self._client.Transpose(_unwrap_data_handle(operand), permutation)) + return self._client.Transpose(operand, permutation) def Rev(self, operand, dimensions): """Rev op.""" - return _wrap_data_handle( - self._client.Rev(_unwrap_data_handle(operand), dimensions)) + return self._client.Rev(operand, dimensions) def Clamp(self, min, operand, max): # pylint: disable=redefined-builtin """Clamp op.""" - return _wrap_data_handle( - self._client.Clamp(_unwrap_data_handle(min), - _unwrap_data_handle(operand), - _unwrap_data_handle(max))) + return self._client.Clamp(min, operand, max) def SelectAndScatter(self, operand, select, window_dimensions, window_strides, padding, source, init_value, scatter): """Select and scatter op, used by the gradient of ReduceWindow. Args: - operand: ComputationDataHandle for array of dimension N and type T over + operand: LocalOp for array of dimension N and type T over which the windows slide. select: Computation of type (T, T) -> Pred to apply to the elements of each window to indicate which element is selected. window_dimensions: sequence of N integers for dimensions of the window. window_strides: sequence of N integers for the strides of the window. padding: PaddingType representing either 'SAME' or 'VALID ' padding. - source: ComputationDataHandle for array of type T with values to scatter. - init_value: ComputationDataHandle of scalar type T for initial out value. + source: LocalOp for array of type T with values to scatter. + init_value: LocalOp of scalar type T for initial out value. scatter: Computation of type (T, T) -> T to apply to each scatter source element with its destination element. Returns: - A ComputationDataHandle representing the added SelectAndScatter op. + A LocalOp representing the added SelectAndScatter op. 
""" pads = _convert_padding_type_to_pad_values( padding, self.GetShape(operand).dimensions(), window_dimensions, window_strides) - return _wrap_data_handle( - self._client.SelectAndScatterWithGeneralPadding( - _unwrap_data_handle(operand), select.c_local_computation, - window_dimensions, window_strides, pads, - _unwrap_data_handle(source), _unwrap_data_handle(init_value), - scatter.c_local_computation)) + return self._client.SelectAndScatterWithGeneralPadding( + operand, select.c_local_computation, window_dimensions, window_strides, + pads, source, init_value, scatter.c_local_computation) def Select(self, pred, on_true, on_false): """Element-wise selection op. @@ -817,17 +793,13 @@ def Select(self, pred, on_true, on_false): Constructs an output array from elements of two input arrays, based on the values of a predicate array. """ - return _wrap_data_handle( - self._client.Select( - _unwrap_data_handle(pred), - _unwrap_data_handle(on_true), - _unwrap_data_handle(on_false))) + return self._client.Select(pred, on_true, on_false) def Slice(self, operand, start_indices, limit_indices, strides=None): """Enqueues a slice operation onto the computation. Args: - operand: ComputationDataHandle for the N dimensional array to be sliced. + operand: LocalOp for the N dimensional array to be sliced. start_indices: iterable of N integers containing the starting indices of the slice for each dimension. limit_indices: iterable of N integers containing the ending indices @@ -836,207 +808,177 @@ def Slice(self, operand, start_indices, limit_indices, strides=None): each dimension. Returns: - A ComputationDataHandle representing the added Slice op. + A LocalOp representing the added Slice op. """ if strides is None: start_indices = list(start_indices) strides = [1] * len(start_indices) - return _wrap_data_handle( - self._client.Slice( - _unwrap_data_handle(operand), start_indices, limit_indices, - strides)) + return self._client.Slice(operand, start_indices, limit_indices, strides) def SliceInDim(self, operand, start_index, limit_index, stride, dimno): """Enqueues a slice-in-dimension operation onto the computation. Args: - operand: ComputationDataHandle for the N dimensional array to be sliced. + operand: LocalOp for the N dimensional array to be sliced. start_index: an integer containing the start index of the slice. limit_index: an integer containing the end index of the slice. stride: an integer containing the stride size for the slice. dimno: an integer indicating the dimension along which to slice. Returns: - A ComputationDataHandle representing the added Slice op. + A LocalOp representing the added Slice op. """ - return _wrap_data_handle( - self._client.SliceInDim( - _unwrap_data_handle(operand), start_index, limit_index, stride, - dimno)) + return self._client.SliceInDim(operand, start_index, limit_index, stride, + dimno) def DynamicSlice(self, operand, start_indices, slice_sizes): """Enqueues a slice op with dynamic start indices onto the computation. Args: - operand: ComputationDataHandle for the N dimensional array to be sliced. - start_indices: ComputationDataHandle for the 1D array of N integers + operand: LocalOp for the N dimensional array to be sliced. + start_indices: LocalOp for the 1D array of N integers containing the starting indices of the slice. slice_sizes: iterable of N integers containing the slice sizes in each dimension. Returns: - A ComputationDataHandle representing the added DynamicSlice op. + A LocalOp representing the added DynamicSlice op. 
""" - return _wrap_data_handle( - self._client.DynamicSlice( - _unwrap_data_handle(operand), - _unwrap_data_handle(start_indices), - slice_sizes)) + return self._client.DynamicSlice(operand, start_indices, slice_sizes) def DynamicUpdateSlice(self, operand, update, start_indices): """Enqueues a dynamic update slice operation onto the computation. Args: - operand: ComputationDataHandle for the N dimensional array to be updated. + operand: LocalOp for the N dimensional array to be updated. update: N dimensional array comprising the slice update. start_indices: Rank-1 array of N integers comprising the starting indices of the slice along each dimension. Returns: - A ComputationDataHandle representing the added DynamicUpdateSlice op. + A LocalOp representing the added DynamicUpdateSlice op. """ - return _wrap_data_handle( - self._client.DynamicUpdateSlice( - _unwrap_data_handle(operand), - _unwrap_data_handle(update), - _unwrap_data_handle(start_indices))) + return self._client.DynamicUpdateSlice(operand, update, start_indices) def Tuple(self, *ops): """Enqueues a tuple operation onto the computation. Args: - ops: a sequence of tuple operands (each a ComputationDataHandle). + ops: a sequence of tuple operands (each a LocalOp). Returns: - A ComputationDataHandle representing the added Tuple op. + A LocalOp representing the added Tuple op. """ - return _wrap_data_handle(self._client.Tuple(_unwrap_data_handles(ops))) + return self._client.Tuple(ops) def GetTupleElement(self, tup, index): """Enqueues a 'get tuple element' operation onto the computation. Args: - tup: the tuple operand (a ComputationDataHandle). + tup: the tuple operand (a LocalOp). index: numeric index to select from the tuple. Returns: - A ComputationDataHandle representing the added GetTupleElement op. + A LocalOp representing the added GetTupleElement op. """ - return _wrap_data_handle( - self._client.GetTupleElement(_unwrap_data_handle(tup), index)) + return self._client.GetTupleElement(tup, index) def Call(self, computation_to_apply, operands): """Enqueues a call operation onto the computation. Args: computation_to_apply: a Computation object. - operands: an iterable of ComputationDataHandle. The number and types of + operands: an iterable of LocalOp. The number and types of operands must match the arity of computation_to_apply. Returns: - A ComputationDataHandle representing the added call op. + A LocalOp representing the added call op. """ - return _wrap_data_handle( - self._client.Call(computation_to_apply.c_local_computation, - _unwrap_data_handles(operands))) + return self._client.Call(computation_to_apply.c_local_computation, operands) def Map(self, operands, computation_to_apply, dimensions, static_operands=()): """Enqueues a map operation onto the computation. Args: - operands: an iterable of ComputationDataHandle. + operands: an iterable of LocalOp. computation_to_apply: a Computation object. dimensions: dimensions over which to apply map the function. static_operands: auxiliary arguments passed to the applied computation. Returns: - A ComputationDataHandle representing the added Map op. + A LocalOp representing the added Map op. 
""" - return _wrap_data_handle( - self._client.Map( - _unwrap_data_handles(operands), - computation_to_apply.c_local_computation, - dimensions, - _unwrap_data_handles(static_operands))) + return self._client.Map(operands, computation_to_apply.c_local_computation, + dimensions, static_operands) def Reduce(self, operand, init_value, computation_to_apply, dimensions): """Enqueues a reduction operation onto the computation. Args: - operand: reduction operand (ComputationDataHandle). - init_value: reduction initial value (ComputationDataHandle). + operand: reduction operand (LocalOp). + init_value: reduction initial value (LocalOp). computation_to_apply: a Computation object - binary reduction function. dimensions: sequence of dimensions (integers) to reduce on. Returns: - A ComputationDataHandle representing the added Reduce op. + A LocalOp representing the added Reduce op. """ - return _wrap_data_handle( - self._client.Reduce( - _unwrap_data_handle(operand), - _unwrap_data_handle(init_value), - computation_to_apply.c_local_computation, - dimensions)) + return self._client.Reduce(operand, init_value, + computation_to_apply.c_local_computation, + dimensions) def ReduceWindow(self, operand, init_value, computation_to_apply, window_dimensions, window_strides, padding): """Enqueues a windowed reduction operation onto the computation. Args: - operand: reduction operand (ComputationDataHandle). - init_value: reduction initial value (ComputationDataHandle). + operand: reduction operand (LocalOp). + init_value: reduction initial value (LocalOp). computation_to_apply: a binary reduction function (Computation). window_dimensions: dimensions of window (sequence of integers). window_strides: strides for window (sequence of integers). padding: PaddingType representing either 'SAME' or 'VALID' padding. Returns: - A ComputationDataHandle representing the added ReduceWindow op. + A LocalOp representing the added ReduceWindow op. """ pads = _convert_padding_type_to_pad_values( padding, self.GetShape(operand).dimensions(), window_dimensions, window_strides) - return _wrap_data_handle( - self._client.ReduceWindowWithGeneralPadding( - _unwrap_data_handle(operand), - _unwrap_data_handle(init_value), - computation_to_apply.c_local_computation, - window_dimensions, window_strides, pads)) + return self._client.ReduceWindowWithGeneralPadding( + operand, init_value, computation_to_apply.c_local_computation, + window_dimensions, window_strides, pads) def RngNormal(self, mu, sigma, dims): """Enqueues an RngNormal operation onto the computation. Args: - mu: A ComputationDataHandle to an F32 scalar specifying the mean. - sigma: A ComputationDataHandle to an F32 scalar specifying the standard + mu: A LocalOp to an F32 scalar specifying the mean. + sigma: A LocalOp to an F32 scalar specifying the standard deviation. dims: A 1D array-like of nonnegative integers specifying the dimensions. - Returns: a ComputationDataHandle to the generated array of F32 values. + Returns: a LocalOp to the generated array of F32 values. """ shape = Shape.array_shape(self.GetShape(mu).element_type(), dims) - return _wrap_data_handle( - self._client.RngNormal( - _unwrap_data_handle(mu), _unwrap_data_handle(sigma), shape)) + return self._client.RngNormal(mu, sigma, shape) def RngUniform(self, a, b, dims): """Enqueues an RngUniform operation onto the computation. 
Args: - a: a ComputationDataHandle to an F32, S32, or U32 scalar (consistent with + a: a LocalOp to an F32, S32, or U32 scalar (consistent with the type of b) specifying the low end of the interval [a, b) over which values are generated. - b: a ComputationDataHandle to an F32, S32, or U32 scalar (consistent with + b: a LocalOp to an F32, S32, or U32 scalar (consistent with the type of a) specifying the high end of the interval [a, b) over which values are generated. dims: A 1D array-like of nonnegative integers specifying the dimensions. - Returns: a ComputationDataHandle to the generated array of values with the + Returns: a LocalOp to the generated array of values with the same numeric type (F32, S32, or U32) as the arguments a and b. """ shape = Shape.array_shape(self.GetShape(a).element_type(), dims) - return _wrap_data_handle( - self._client.RngUniform( - _unwrap_data_handle(a), _unwrap_data_handle(b), shape)) + return self._client.RngUniform(a, b, shape) def While(self, cond, body, init): """Enqueues a While operation onto the computation. @@ -1044,112 +986,105 @@ def While(self, cond, body, init): Args: cond: a Computation for the loop condition, which has type T -> PRED body: a Computation for the loop body, which has type T -> T - init: a ComputationDataHandle for the initial parameter, which has type T + init: a LocalOp for the initial parameter, which has type T - Returns: a ComputationDataHandle representing the While operation. + Returns: a LocalOp representing the While operation. """ - return _wrap_data_handle( - self._client.While(cond.c_local_computation, - body.c_local_computation, - _unwrap_data_handle(init))) + return self._client.While(cond.c_local_computation, + body.c_local_computation, init) def Conditional(self, pred, true_operand, true_computation, false_operand, false_computation): """Enqueues a Conditional operation onto the computation. Args: - predicate: a ComputationDataHandle to test, which has scalar type PRED - true_operand: a ComputationDataHandle of type T_0 + predicate: a LocalOp to test, which has scalar type PRED + true_operand: a LocalOp of type T_0 true_computation: a Computation to apply to true_operand, type T_0 -> S false_operand: a ComputationDatahandle of type T_1 false_computation: a Computation to apply to false_operand, type T_1 -> S - Returns: a ComputationDataHandle representing the Conditional operation. + Returns: a LocalOp representing the Conditional operation. """ - return _wrap_data_handle( - self._client.Conditional( - _unwrap_data_handle(pred), _unwrap_data_handle(true_operand), - true_computation.c_local_computation, - _unwrap_data_handle(false_operand), - false_computation.c_local_computation)) + return self._client.Conditional( + pred, true_operand, true_computation.c_local_computation, false_operand, + false_computation.c_local_computation) - def IsConstant(self, operand, num_parameters=0): - """Enqueues an IsConstant operation onto the computation. + def IsConstant(self, operand): + """Checks whether the given operand is a compile-time constant. Args: operand: a ComputationDataHandle to test. - num_parameters: optional int, number of computation parameters to treat as - constant (default 0). Returns: bool indicating whether `operand` is a compile-time constant, - meaning its value does not depend on parameters with index greater than or - equal to `num_parameters`. + meaning its value does not depend on any parameters, or on stateful + operators such as `RngNormal` or `Infeed`. 
""" - return self._client.IsConstant(_unwrap_data_handle(operand), num_parameters) + return self._client.IsConstant(operand) + + def BuildConstantSubGraph(self, operand): + """Builds a constant sub graph. + + Args: + operand: a LocalOp to test. + Returns: a LocalComputation that is rooted on the given `operand` which is a + compile-time constant. + """ + return self._client.BuildConstantSubGraph(operand) def Dot(self, lhs, rhs): """Enqueues a dot operation onto the computation. Args: - lhs: ComputationDataHandle for the rank 1 or rank 2 left-hand-side array. - rhs: ComputationDataHandle for the rank 1 or rank 2 right-hand-side array. + lhs: LocalOp for the rank 1 or rank 2 left-hand-side array. + rhs: LocalOp for the rank 1 or rank 2 right-hand-side array. - Returns: a ComputationDataHandle representing the Dot operation. + Returns: a LocalOp representing the Dot operation. """ - return _wrap_data_handle( - self._client.Dot(_unwrap_data_handle(lhs), _unwrap_data_handle(rhs))) + return self._client.Dot(lhs, rhs) def DotGeneral(self, lhs, rhs, dimension_numbers): """Enqueues a general dot operation onto the computation. Args: - lhs: ComputationDataHandle for the left-hand-side array. - rhs: ComputationDataHandle for the right-hand-side array. + lhs: LocalOp for the left-hand-side array. + rhs: LocalOp for the right-hand-side array. dimension_numbers: either an xla_data_pb2.DotDimensionNumbers or a nested tuple ((lhs_contract, rhs_contract), (lhs_batch, rhs_batch)) of lists of integers representing the dimensions to treat as contracting dimensions and batch dimensions on each input operand. - Returns: a ComputationDataHandle representing the DotGeneral operation. + Returns: a LocalOp representing the DotGeneral operation. """ if not isinstance(dimension_numbers, xla_data_pb2.DotDimensionNumbers): dimension_numbers = GetDotDimensionsFromLists(dimension_numbers) - return _wrap_data_handle( - self._client.DotGeneral( - _unwrap_data_handle(lhs), _unwrap_data_handle(rhs), - dimension_numbers)) + return self._client.DotGeneral(lhs, rhs, dimension_numbers) def Conv(self, lhs, rhs, window_strides, padding): """Enqueues a Conv operation onto the computation. Args: - lhs: ComputationDataHandle for the rank N+2 array of inputs. - rhs: ComputationDataHandle for the rank N+2 array of kernel weights. + lhs: LocalOp for the rank N+2 array of inputs. + rhs: LocalOp for the rank N+2 array of kernel weights. window_strides: length-N array-like of integer kernel strides. padding: PaddingType representing either 'SAME' or 'VALID' padding. - Returns: a ComputationDataHandle representing the Conv operation. + Returns: a LocalOp representing the Conv operation. """ pads = _convert_padding_type_to_pad_values( padding, self.GetShape(lhs).dimensions()[2:], self.GetShape(rhs).dimensions()[2:], window_strides) dimension_numbers = self._GetConvDimensionNumbers(len(window_strides)) - return _wrap_data_handle( - self._client.ConvGeneralDilated(_unwrap_data_handle(lhs), - _unwrap_data_handle(rhs), - window_strides, - pads, - (), - (), - dimension_numbers)) + return self._client.ConvGeneralDilated(lhs, rhs, window_strides, pads, (), + (), dimension_numbers) def ConvWithGeneralPadding(self, lhs, rhs, window_strides, padding, lhs_dilation, rhs_dilation): """Enqueues a ConvWithGeneralPadding operation onto the computation. Args: - lhs: ComputationDataHandle for the rank N+2 array of inputs. - rhs: ComputationDataHandle for the rank N+2 array of kernel weights. + lhs: LocalOp for the rank N+2 array of inputs. 
+ rhs: LocalOp for the rank N+2 array of kernel weights. window_strides: length-N array-like of kernel strides. padding: length-N array-like of pairs of integers of (low, high) padding. lhs_dilation: length-N array-like of dilation factors. @@ -1159,14 +1094,9 @@ def ConvWithGeneralPadding(self, lhs, rhs, window_strides, padding, A ComputationdataHandle representing the added ConvWithGeneralPadding op. """ dimension_numbers = self._GetConvDimensionNumbers(len(window_strides)) - return _wrap_data_handle( - self._client.ConvGeneralDilated(_unwrap_data_handle(lhs), - _unwrap_data_handle(rhs), - window_strides, - padding, - lhs_dilation, - rhs_dilation, - dimension_numbers)) + return self._client.ConvGeneralDilated(lhs, rhs, window_strides, padding, + lhs_dilation, rhs_dilation, + dimension_numbers) def _GetConvDimensionNumbers(self, num_spatial_dims): """Create ConvolutionDimensionNumbers proto for convolutions.""" @@ -1183,6 +1113,61 @@ def _GetConvDimensionNumbers(self, num_spatial_dims): dimension_numbers.output_spatial_dimensions.extend(range(2, 2 + nd)) return dimension_numbers + def ConvGeneralDilated(self, lhs, rhs, window_strides, padding, lhs_dilation, + rhs_dilation, dimension_numbers): + """Enqueues a ConvGeneralDilated operation onto the computation. + + Args: + lhs: LocalOp for the rank N+2 array of inputs. + rhs: LocalOp for the rank N+2 array of kernel weights. + window_strides: length-N array-like of integer kernel strides. + padding: length-N array-like of pairs of integers of (low, high) padding. + lhs_dilation: length-N array-like of integer dilation factors. + rhs_dilation: length-N array-like of integer dilation factors. + dimension_numbers: either an xla_data_pb2.ConvolutionDimensionNumbers or a + triple (lhs_spec, rhs_spec, out_spec) where each element is a string of + length N+2 identifying by position (1) batch dimensions in lhs, rhs, and + the output with the character 'N', (2) feature dimensions in lhs and the + output with the character 'C', (3) input and output feature dimensions + in rhs with the characters 'I' and 'O' respectively, and (4) spatial + dimension correspondences between lhs, rhs, and the output using any + distinct characters. For example, to indicate dimension numbers + consistent with the Conv operation with two spatial dimensions, one + could use ('NCHW', 'OIHW', 'NCHW'). As another example, to indicate + dimension numbers consistent with the TensorFlow Conv2D operation, one + could use ('NHWC', 'HWIO', 'NHWC'). When using the latter form of + convolution dimension specification, window strides are associated with + spatial dimension character labels according to the order in which the + labels appear in the rhs_spec string, so that window_strides[0] is + matched with the dimension corresponding to the first character + appearing in rhs_spec that is not 'I' or 'O'. + + Returns: a LocalOp representing the ConvGeneralDilated operation. 
+ """ + if not isinstance(dimension_numbers, + xla_data_pb2.ConvolutionDimensionNumbers): + lhs_spec, rhs_spec, out_spec = dimension_numbers + dimension_numbers = xla_data_pb2.ConvolutionDimensionNumbers() + + dimension_numbers.input_batch_dimension = lhs_spec.index('N') + dimension_numbers.input_feature_dimension = lhs_spec.index('C') + dimension_numbers.output_batch_dimension = out_spec.index('N') + dimension_numbers.output_feature_dimension = out_spec.index('C') + dimension_numbers.kernel_output_feature_dimension = rhs_spec.index('O') + dimension_numbers.kernel_input_feature_dimension = rhs_spec.index('I') + + dimension_numbers.kernel_spatial_dimensions.extend( + i for i, c in enumerate(rhs_spec) if c not in {'I', 'O'}) + dimension_numbers.input_spatial_dimensions.extend( + sorted((i for i, c in enumerate(lhs_spec) if c not in {'N', 'C'}), + key=lambda i: rhs_spec.index(lhs_spec[i]))) + dimension_numbers.output_spatial_dimensions.extend( + sorted((i for i, c in enumerate(out_spec) if c not in {'N', 'C'}), + key=lambda i: rhs_spec.index(out_spec[i]))) + return self._client.ConvGeneralDilated(lhs, rhs, window_strides, padding, + lhs_dilation, rhs_dilation, + dimension_numbers) + def _forward_methods_to_local_builder(): """Forward remaining ComputationBuilder methods to the C API. @@ -1196,15 +1181,14 @@ def forward_to_local_builder_with_handles(target_method, is_binop=False): """Generate a forwarding method that wraps/unwraps data handles.""" def forward(self, *args, **kwargs): - unwrapped_args = [_unwrap_data_handle(arg) for arg in args] + arg_list = list(args) - if is_binop and len(unwrapped_args) < 3: - unwrapped_args.append(kwargs.get('broadcast_dimensions', ())) + if is_binop and len(arg_list) < 3: + arg_list.append(kwargs.get('broadcast_dimensions', ())) - return _wrap_data_handle( - target_method( - self._client, # pylint: disable=protected-access - *unwrapped_args)) + return target_method( + self._client, # pylint: disable=protected-access + *arg_list) return forward diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py index c073c02040e4d2..375e720f9b433f 100644 --- a/tensorflow/compiler/xla/python/xla_client_test.py +++ b/tensorflow/compiler/xla/python/xla_client_test.py @@ -164,6 +164,16 @@ def testSum2DF32(self): c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]]))) self._ExecuteAndCompareClose(c, expected=[[2, 1, 4], [3, 6, 5]]) + def testGetProto(self): + c = self._NewComputation() + c.Add( + c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6]])), + c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]]))) + built = c.Build() + proto = built.GetProto() # HloModuleProto + self.assertTrue(len(proto.computations) == 1) + self.assertTrue(len(proto.computations[0].instructions) == 3) + def testSum2DF64(self): c = self._NewComputation() c.Add( @@ -509,6 +519,46 @@ def testConvWithGeneralPaddingF32(self): [40., 50., 0.]]]]) self._ExecuteAndCompareClose(c, expected=result) + def testConvGeneralDilatedF32(self): + c = self._NewComputation() + a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 1, 2, 3) + rhs = a(1, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + dimension_numbers = ("NCHW", "OIHW", "NCHW") + c.ConvGeneralDilated(c.Constant(lhs), c.Constant(rhs), + strides, pads, lhs_dilation, rhs_dilation, + dimension_numbers) + result = np.array([[[[0., 0., 0.], + [10., 20., 0.], + [0., 0., 0.], + [40., 50., 0.]]]]) + 
self._ExecuteAndCompareClose(c, expected=result) + + def testConvGeneralDilatedPermutedF32(self): + c = self._NewComputation() + a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32") + lhs = a(1, 1, 2, 3) + rhs = a(1, 1, 1, 2) * 10 + strides = [1, 1] + pads = [(1, 0), (0, 1)] + lhs_dilation = (2, 1) + rhs_dilation = (1, 1) + + dimension_numbers = ("NHWC", "OIHW", "CWNH") + c.ConvGeneralDilated(c.Constant(np.transpose(lhs, (0, 2, 3, 1))), + c.Constant(rhs), + strides, pads, lhs_dilation, rhs_dilation, + dimension_numbers) + result = np.array([[[[0., 0., 0.], + [10., 20., 0.], + [0., 0., 0.], + [40., 50., 0.]]]]) + self._ExecuteAndCompareClose(c, expected=np.transpose(result, (1, 3, 0, 2))) + def testBooleanNot(self): c = self._NewComputation() arr = NumpyArrayBool([True, False, True]) diff --git a/tensorflow/compiler/xla/reference_util.cc b/tensorflow/compiler/xla/reference_util.cc index df9dbc58308f04..c289c84cff7438 100644 --- a/tensorflow/compiler/xla/reference_util.cc +++ b/tensorflow/compiler/xla/reference_util.cc @@ -572,7 +572,8 @@ ReferenceUtil::ConvArray4DGeneralDimensionsDilated( b.AddInstruction(HloInstruction::CreateConvolve( shape, lhs_instruction, rhs_instruction, window, dnums)); - HloModule module("ReferenceUtil"); + HloModuleConfig config; + HloModule module("ReferenceUtil", config); auto computation = module.AddEntryComputation(b.Build()); HloEvaluator evaluator; diff --git a/tensorflow/compiler/xla/reference_util.h b/tensorflow/compiler/xla/reference_util.h index 28d6a8c3fe85fa..8fa6961d197dce 100644 --- a/tensorflow/compiler/xla/reference_util.h +++ b/tensorflow/compiler/xla/reference_util.h @@ -265,9 +265,9 @@ class ReferenceUtil { const Array3D& rhs, int concatenate_dimension) { CHECK(0 <= concatenate_dimension && concatenate_dimension < 3); - std::vector lhs_dims = {lhs.n1(), lhs.n2(), lhs.n3()}; - std::vector rhs_dims = {rhs.n1(), rhs.n2(), rhs.n3()}; - std::vector out_dims = {rhs.n1(), rhs.n2(), rhs.n3()}; + const int64 lhs_dims[] = {lhs.n1(), lhs.n2(), lhs.n3()}; + const int64 rhs_dims[] = {rhs.n1(), rhs.n2(), rhs.n3()}; + int64 out_dims[] = {rhs.n1(), rhs.n2(), rhs.n3()}; for (int i = 0; i < 3; ++i) { if (i != concatenate_dimension) { out_dims[i] = lhs_dims[i]; @@ -299,9 +299,9 @@ class ReferenceUtil { const Array4D& rhs, int concatenate_dimension) { CHECK(0 <= concatenate_dimension && concatenate_dimension < 4); - std::vector lhs_dims = {lhs.n1(), lhs.n2(), lhs.n3(), lhs.n4()}; - std::vector rhs_dims = {rhs.n1(), rhs.n2(), rhs.n3(), rhs.n4()}; - std::vector out_dims = {rhs.n1(), rhs.n2(), rhs.n3(), rhs.n4()}; + const int64 lhs_dims[] = {lhs.n1(), lhs.n2(), lhs.n3(), lhs.n4()}; + const int64 rhs_dims[] = {rhs.n1(), rhs.n2(), rhs.n3(), rhs.n4()}; + int64 out_dims[] = {rhs.n1(), rhs.n2(), rhs.n3(), rhs.n4()}; for (int i = 0; i < 4; ++i) { if (i != concatenate_dimension) { out_dims[i] = lhs_dims[i]; @@ -330,13 +330,14 @@ class ReferenceUtil { return result; } - // Slices with modulo-wrapping. 
+ // Slices with index clamping template - static std::vector ModSlice1D(const tensorflow::gtl::ArraySlice& input, - int64 start, int64 size) { + static std::vector ClampSlice1D( + const tensorflow::gtl::ArraySlice& input, int64 start, int64 size) { + start = std::min(std::max(0, start), input.size() - size); std::vector result; for (int64 i = 0; i < size; ++i) { - result.push_back(input[(start + i) % input.size()]); + result.push_back(input[(start + i)]); } return result; } @@ -552,12 +553,11 @@ class ReferenceUtil { const NativeT pad) { CHECK_EQ(padding.dimensions_size(), 3); - const std::vector input_bounds = {operand.n1(), operand.n2(), - operand.n3()}; - std::vector pad_low(3); - std::vector pad_high(3); - std::vector pad_interior(3); - std::vector output_bounds(3); + const int64 input_bounds[] = {operand.n1(), operand.n2(), operand.n3()}; + int64 pad_low[3]; + int64 pad_high[3]; + int64 pad_interior[3]; + int64 output_bounds[3]; for (int64 i = 0; i < 3; ++i) { pad_low[i] = padding.dimensions(i).edge_padding_low(); pad_high[i] = padding.dimensions(i).edge_padding_high(); @@ -573,7 +573,7 @@ class ReferenceUtil { Array3D result(output_bounds[0], output_bounds[1], output_bounds[2]); - std::vector indices = {0, 0, 0}; + int indices[] = {0, 0, 0}; for (indices[0] = 0; indices[0] < output_bounds[0]; ++indices[0]) { for (indices[1] = 0; indices[1] < output_bounds[1]; ++indices[1]) { for (indices[2] = 0; indices[2] < output_bounds[2]; ++indices[2]) { @@ -611,12 +611,12 @@ class ReferenceUtil { const NativeT pad) { CHECK_EQ(padding.dimensions_size(), 4); - const std::vector input_bounds = {operand.n1(), operand.n2(), - operand.n3(), operand.n4()}; - std::vector pad_low(4); - std::vector pad_high(4); - std::vector pad_interior(4); - std::vector output_bounds(4); + const int64 input_bounds[] = {operand.n1(), operand.n2(), operand.n3(), + operand.n4()}; + int64 pad_low[4]; + int64 pad_high[4]; + int64 pad_interior[4]; + int64 output_bounds[4]; for (int64 i = 0; i < 4; ++i) { pad_low[i] = padding.dimensions(i).edge_padding_low(); pad_high[i] = padding.dimensions(i).edge_padding_high(); diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD index 977f8637873a4b..0d56a9a477b159 100644 --- a/tensorflow/compiler/xla/rpc/BUILD +++ b/tensorflow/compiler/xla/rpc/BUILD @@ -55,7 +55,7 @@ tf_cc_test( deps = [ ":grpc_stub", "//tensorflow/compiler/xla/client", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc index b559ee4b5a345d..313f11a9a95715 100644 --- a/tensorflow/compiler/xla/rpc/grpc_client_test.cc +++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc @@ -24,7 +24,7 @@ limitations under the License. 
#include "grpc++/security/credentials.h" #include "tensorflow/compiler/xla/client/client.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/rpc/grpc_stub.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/lib/io/path.h" @@ -84,7 +84,7 @@ TEST_F(GRPCClientTestBase, ItsAlive) { } TEST_F(GRPCClientTestBase, AxpyTenValues) { - ComputationBuilder builder(client_.get(), "axpy_10"); + XlaBuilder builder("axpy_10"); auto alpha = builder.ConstantR0(3.1415926535); auto x = builder.ConstantR1( {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); @@ -101,8 +101,8 @@ TEST_F(GRPCClientTestBase, AxpyTenValues) { TF_ASSERT_OK_AND_ASSIGN(auto computation, builder.Build()); TF_ASSERT_OK_AND_ASSIGN(auto result_literal, client_->ExecuteAndTransfer( computation, {}, nullptr)); - LiteralTestUtil::ExpectNear(*expected_literal, *result_literal, - ErrorSpec(0.0001)); + EXPECT_TRUE(LiteralTestUtil::Near(*expected_literal, *result_literal, + ErrorSpec(0.0001))); } } // namespace diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc index 0b100bd108e239..4e1435fa30a24c 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.cc +++ b/tensorflow/compiler/xla/rpc/grpc_service.cc @@ -27,24 +27,11 @@ namespace xla { return std::move(grpc_service); } -::grpc::Status DelegateRPC(std::function op) { - tensorflow::Status s = op(); +::grpc::Status DelegateRPC(std::function op) { + Status s = op(); return tensorflow::ToGrpcStatus(s); } -::grpc::Status GRPCService::Computation(::grpc::ServerContext* context, - const ComputationRequest* arg, - ComputationResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->Computation(arg, result); }); -} - -::grpc::Status GRPCService::CreateOp(::grpc::ServerContext* context, - const OpRequest* arg, OpResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->Op(arg, result); }); -} - ::grpc::Status GRPCService::Unregister(::grpc::ServerContext* context, const UnregisterRequest* arg, UnregisterResponse* result) { @@ -60,26 +47,11 @@ ::grpc::Status GRPCService::DeconstructTuple(::grpc::ServerContext* context, }); } -::grpc::Status GRPCService::SetReturnValue(::grpc::ServerContext* context, - const SetReturnValueRequest* arg, - SetReturnValueResponse* results) { - return DelegateRPC([this, arg, results]() { - return service_->SetReturnValue(arg, results); - }); -} - -::grpc::Status GRPCService::Execute(::grpc::ServerContext* context, - const ExecuteRequest* arg, - ExecuteResponse* result) { +::grpc::Status GRPCService::ExecuteGraph(::grpc::ServerContext* /*context*/, + const ExecuteGraphRequest* arg, + ExecuteResponse* result) { return DelegateRPC( - [this, arg, result]() { return service_->Execute(arg, result); }); -} - -::grpc::Status GRPCService::ExecuteAsync(::grpc::ServerContext* context, - const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->ExecuteAsync(arg, result); }); + [this, arg, result]() { return service_->ExecuteGraph(arg, result); }); } ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context, @@ -129,20 +101,6 @@ ::grpc::Status GRPCService::ResetDevice(::grpc::ServerContext* context, [this, arg, result]() { return service_->ResetDevice(arg, result); }); } -::grpc::Status GRPCService::IsConstant(::grpc::ServerContext* 
context, - const IsConstantRequest* arg, - IsConstantResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->IsConstant(arg, result); }); -} - -::grpc::Status GRPCService::ComputeConstant(::grpc::ServerContext* context, - const ComputeConstantRequest* arg, - ComputeConstantResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->ComputeConstant(arg, result); }); -} - ::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context, const GetShapeRequest* arg, GetShapeResponse* result) { @@ -150,43 +108,4 @@ ::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context, [this, arg, result]() { return service_->GetShape(arg, result); }); } -::grpc::Status GRPCService::GetComputationShape( - ::grpc::ServerContext* context, const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) { - return DelegateRPC([this, arg, result]() { - return service_->GetComputationShape(arg, result); - }); -} - -::grpc::Status GRPCService::GetLocalShape(::grpc::ServerContext* context, - const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) { - return DelegateRPC( - [this, arg, result]() { return service_->GetLocalShape(arg, result); }); -} - -::grpc::Status GRPCService::GetComputationStats( - ::grpc::ServerContext* context, const ComputationStatsRequest* arg, - ComputationStatsResponse* result) { - return DelegateRPC([this, arg, result]() { - return service_->GetComputationStats(arg, result); - }); -} - -::grpc::Status GRPCService::SnapshotComputation( - ::grpc::ServerContext* context, const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) { - return DelegateRPC([this, arg, result]() { - return service_->SnapshotComputation(arg, result); - }); -} - -::grpc::Status GRPCService::LoadComputationSnapshot( - ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) { - return DelegateRPC([this, arg, result]() { - return service_->LoadComputationSnapshot(arg, result); - }); -} - } // namespace xla diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h index fad74375bd59f7..5cd573167ae8c0 100644 --- a/tensorflow/compiler/xla/rpc/grpc_service.h +++ b/tensorflow/compiler/xla/rpc/grpc_service.h @@ -31,13 +31,6 @@ class GRPCService : public grpc::XlaService::Service { static StatusOr> NewService( se::Platform* platform = nullptr); - ::grpc::Status Computation(::grpc::ServerContext* context, - const ComputationRequest* arg, - ComputationResponse* result) override; - - ::grpc::Status CreateOp(::grpc::ServerContext* context, const OpRequest* arg, - OpResponse* result) override; - ::grpc::Status Unregister(::grpc::ServerContext* context, const UnregisterRequest* arg, UnregisterResponse* result) override; @@ -46,17 +39,9 @@ class GRPCService : public grpc::XlaService::Service { const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) override; - ::grpc::Status SetReturnValue(::grpc::ServerContext* context, - const SetReturnValueRequest* arg, - SetReturnValueResponse* results) override; - - ::grpc::Status Execute(::grpc::ServerContext* context, - const ExecuteRequest* arg, - ExecuteResponse* result) override; - - ::grpc::Status ExecuteAsync(::grpc::ServerContext* context, - const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override; + ::grpc::Status ExecuteGraph(::grpc::ServerContext* context, + const ExecuteGraphRequest* arg, + ExecuteResponse* result) override; ::grpc::Status 
WaitForExecution(::grpc::ServerContext* context, const WaitForExecutionRequest* arg, @@ -82,38 +67,10 @@ class GRPCService : public grpc::XlaService::Service { const ResetDeviceRequest* arg, ResetDeviceResponse* result) override; - ::grpc::Status IsConstant(::grpc::ServerContext* context, - const IsConstantRequest* arg, - IsConstantResponse* result) override; - - ::grpc::Status ComputeConstant(::grpc::ServerContext* context, - const ComputeConstantRequest* arg, - ComputeConstantResponse* result) override; - ::grpc::Status GetShape(::grpc::ServerContext* context, const GetShapeRequest* arg, GetShapeResponse* result) override; - ::grpc::Status GetComputationShape( - ::grpc::ServerContext* context, const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) override; - - ::grpc::Status GetLocalShape(::grpc::ServerContext* context, - const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) override; - - ::grpc::Status GetComputationStats(::grpc::ServerContext* context, - const ComputationStatsRequest* arg, - ComputationStatsResponse* result) override; - - ::grpc::Status SnapshotComputation( - ::grpc::ServerContext* context, const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) override; - - ::grpc::Status LoadComputationSnapshot( - ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) override; - private: std::unique_ptr<::xla::Service> service_; diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc index e1f2b0abe39b10..7b8ab158e1396d 100644 --- a/tensorflow/compiler/xla/rpc/grpc_stub.cc +++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc @@ -20,82 +20,56 @@ namespace xla { GRPCStub::~GRPCStub() = default; -tensorflow::Status MakeRPC( +Status MakeRPC( const std::function<::grpc::Status(::grpc::ClientContext*)>& rpc_method) { ::grpc::ClientContext context; ::grpc::Status s = rpc_method(&context); return tensorflow::FromGrpcStatus(s); } -tensorflow::Status GRPCStub::TransferToClient( - const TransferToClientRequest* request, - TransferToClientResponse* response) { +Status GRPCStub::TransferToClient(const TransferToClientRequest* request, + TransferToClientResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->TransferToClient(context, *request, response); }); } -tensorflow::Status GRPCStub::TransferToServer( - const TransferToServerRequest* request, - TransferToServerResponse* response) { +Status GRPCStub::TransferToServer(const TransferToServerRequest* request, + TransferToServerResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->TransferToServer(context, *request, response); }); } -tensorflow::Status GRPCStub::TransferToInfeed( - const TransferToInfeedRequest* request, - TransferToInfeedResponse* response) { +Status GRPCStub::TransferToInfeed(const TransferToInfeedRequest* request, + TransferToInfeedResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->TransferToInfeed(context, *request, response); }); } -tensorflow::Status GRPCStub::TransferFromOutfeed( - const TransferFromOutfeedRequest* request, - TransferFromOutfeedResponse* response) { +Status GRPCStub::TransferFromOutfeed(const TransferFromOutfeedRequest* request, + TransferFromOutfeedResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return 
grpc_stub_->TransferFromOutfeed(context, *request, response); }); } -tensorflow::Status GRPCStub::ResetDevice(const ResetDeviceRequest* request, - ResetDeviceResponse* response) { +Status GRPCStub::ResetDevice(const ResetDeviceRequest* request, + ResetDeviceResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->ResetDevice(context, *request, response); }); } -tensorflow::Status GRPCStub::LoadComputationSnapshot( - const LoadComputationSnapshotRequest* request, - LoadComputationSnapshotResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->LoadComputationSnapshot(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::Execute(const ExecuteRequest* request, - ExecuteResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->Execute(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request, - ExecuteResponse* response) { +Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request, + ExecuteResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->ExecuteGraph(context, *request, response); }); } -tensorflow::Status GRPCStub::ExecuteParallel( - const ExecuteParallelRequest* request, ExecuteParallelResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->ExecuteParallel(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::ExecuteGraphParallel( +Status GRPCStub::ExecuteGraphParallel( const ExecuteGraphParallelRequest* request, ExecuteParallelResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { @@ -103,38 +77,21 @@ tensorflow::Status GRPCStub::ExecuteGraphParallel( }); } -tensorflow::Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request, - ExecuteAsyncResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->ExecuteAsync(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::WaitForExecution( - const WaitForExecutionRequest* request, - WaitForExecutionResponse* response) { +Status GRPCStub::WaitForExecution(const WaitForExecutionRequest* request, + WaitForExecutionResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->WaitForExecution(context, *request, response); }); } -tensorflow::Status GRPCStub::DeconstructTuple( - const DeconstructTupleRequest* request, - DeconstructTupleResponse* response) { +Status GRPCStub::DeconstructTuple(const DeconstructTupleRequest* request, + DeconstructTupleResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->DeconstructTuple(context, *request, response); }); } -tensorflow::Status GRPCStub::GetComputationStats( - const ComputationStatsRequest* request, - ComputationStatsResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->GetComputationStats(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::GetComputationGraphStats( +Status GRPCStub::GetComputationGraphStats( const ComputationGraphStatsRequest* request, ComputationStatsResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { @@ -142,81 
+99,28 @@ tensorflow::Status GRPCStub::GetComputationGraphStats( }); } -tensorflow::Status GRPCStub::GetComputationShape( - const GetComputationShapeRequest* request, - GetComputationShapeResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->GetComputationShape(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::GetShape(const GetShapeRequest* request, - GetShapeResponse* response) { +Status GRPCStub::GetShape(const GetShapeRequest* request, + GetShapeResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->GetShape(context, *request, response); }); } -tensorflow::Status GRPCStub::GetDeviceHandles( - const GetDeviceHandlesRequest* request, - GetDeviceHandlesResponse* response) { +Status GRPCStub::GetDeviceHandles(const GetDeviceHandlesRequest* request, + GetDeviceHandlesResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->GetDeviceHandles(context, *request, response); }); } -tensorflow::Status GRPCStub::CreateChannelHandle( - const CreateChannelHandleRequest* request, - CreateChannelHandleResponse* response) { +Status GRPCStub::CreateChannelHandle(const CreateChannelHandleRequest* request, + CreateChannelHandleResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->CreateChannelHandle(context, *request, response); }); } -// Methods used by ComputationBuilder. -tensorflow::Status GRPCStub::Computation(const ComputationRequest* request, - ComputationResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->Computation(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::Op(const OpRequest* request, - OpResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->CreateOp(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request, - GetLocalShapeResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->GetLocalShape(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::SetReturnValue( - const SetReturnValueRequest* request, SetReturnValueResponse* responses) { - return MakeRPC([this, request, responses](::grpc::ClientContext* context) { - return grpc_stub_->SetReturnValue(context, *request, responses); - }); -} - -tensorflow::Status GRPCStub::IsConstant(const IsConstantRequest* request, - IsConstantResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->IsConstant(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::ComputeConstant( - const ComputeConstantRequest* request, ComputeConstantResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->ComputeConstant(context, *request, response); - }); -} - -tensorflow::Status GRPCStub::ComputeConstantGraph( +Status GRPCStub::ComputeConstantGraph( const ComputeConstantGraphRequest* request, ComputeConstantResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { @@ -224,18 +128,9 @@ tensorflow::Status GRPCStub::ComputeConstantGraph( }); } -// Methods used by Computation. 
-tensorflow::Status GRPCStub::SnapshotComputation( - const SnapshotComputationRequest* request, - SnapshotComputationResponse* response) { - return MakeRPC([this, request, response](::grpc::ClientContext* context) { - return grpc_stub_->SnapshotComputation(context, *request, response); - }); -} - // Methods used by GlobalData. -tensorflow::Status GRPCStub::Unregister(const UnregisterRequest* request, - UnregisterResponse* response) { +Status GRPCStub::Unregister(const UnregisterRequest* request, + UnregisterResponse* response) { return MakeRPC([this, request, response](::grpc::ClientContext* context) { return grpc_stub_->Unregister(context, *request, response); }); diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h index fd9810d4f1a5e0..8dfcb761387d60 100644 --- a/tensorflow/compiler/xla/rpc/grpc_stub.h +++ b/tensorflow/compiler/xla/rpc/grpc_stub.h @@ -28,105 +28,51 @@ class GRPCStub : public ServiceInterface { explicit GRPCStub(grpc::XlaService::Stub* stub) : grpc_stub_(stub) {} ~GRPCStub() override; - tensorflow::Status TransferToClient( - const TransferToClientRequest* arg, - TransferToClientResponse* result) override; + Status TransferToClient(const TransferToClientRequest* arg, + TransferToClientResponse* result) override; - tensorflow::Status TransferToServer( - const TransferToServerRequest* arg, - TransferToServerResponse* result) override; + Status TransferToServer(const TransferToServerRequest* arg, + TransferToServerResponse* result) override; - tensorflow::Status TransferToInfeed( - const TransferToInfeedRequest* arg, - TransferToInfeedResponse* result) override; + Status TransferToInfeed(const TransferToInfeedRequest* arg, + TransferToInfeedResponse* result) override; - tensorflow::Status TransferFromOutfeed( - const TransferFromOutfeedRequest* arg, - TransferFromOutfeedResponse* result) override; + Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg, + TransferFromOutfeedResponse* result) override; - tensorflow::Status ResetDevice(const ResetDeviceRequest* arg, - ResetDeviceResponse* result) override; + Status ResetDevice(const ResetDeviceRequest* arg, + ResetDeviceResponse* result) override; - tensorflow::Status LoadComputationSnapshot( - const LoadComputationSnapshotRequest* request, - LoadComputationSnapshotResponse* result) override; + Status ExecuteGraph(const ExecuteGraphRequest* request, + ExecuteResponse* response) override; - tensorflow::Status Execute(const ExecuteRequest* arg, - ExecuteResponse* result) override; + Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* request, + ExecuteParallelResponse* response) override; - tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* request, - ExecuteResponse* response) override; + Status WaitForExecution(const WaitForExecutionRequest* arg, + WaitForExecutionResponse* result) override; - tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) override; + Status DeconstructTuple(const DeconstructTupleRequest* arg, + DeconstructTupleResponse* result) override; - tensorflow::Status ExecuteGraphParallel( - const ExecuteGraphParallelRequest* request, - ExecuteParallelResponse* response) override; + Status GetComputationGraphStats(const ComputationGraphStatsRequest* request, + ComputationStatsResponse* response) override; - tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override; + Status GetShape(const GetShapeRequest* arg, + GetShapeResponse* result) 
override; - tensorflow::Status WaitForExecution( - const WaitForExecutionRequest* arg, - WaitForExecutionResponse* result) override; + Status GetDeviceHandles(const GetDeviceHandlesRequest* arg, + GetDeviceHandlesResponse* result) override; - tensorflow::Status DeconstructTuple( - const DeconstructTupleRequest* arg, - DeconstructTupleResponse* result) override; + Status CreateChannelHandle(const CreateChannelHandleRequest* arg, + CreateChannelHandleResponse* result) override; - tensorflow::Status GetComputationStats( - const ComputationStatsRequest* arg, - ComputationStatsResponse* result) override; - - tensorflow::Status GetComputationGraphStats( - const ComputationGraphStatsRequest* request, - ComputationStatsResponse* response) override; - - tensorflow::Status GetComputationShape( - const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) override; - - tensorflow::Status GetShape(const GetShapeRequest* arg, - GetShapeResponse* result) override; - - tensorflow::Status GetDeviceHandles( - const GetDeviceHandlesRequest* arg, - GetDeviceHandlesResponse* result) override; - - tensorflow::Status CreateChannelHandle( - const CreateChannelHandleRequest* arg, - CreateChannelHandleResponse* result) override; - - // Methods used by ComputationBuilder. - tensorflow::Status Computation(const ComputationRequest* arg, - ComputationResponse* result) override; - - tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override; - tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) override; - - tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg, - SetReturnValueResponse* results) override; - - tensorflow::Status IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) override; - - tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg, - ComputeConstantResponse* result) override; - - tensorflow::Status ComputeConstantGraph( - const ComputeConstantGraphRequest* arg, - ComputeConstantResponse* result) override; - - // Methods used by Computation. - tensorflow::Status SnapshotComputation( - const SnapshotComputationRequest* ag, - SnapshotComputationResponse* result) override; + Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg, + ComputeConstantResponse* result) override; // Methods used by GlobalData. - tensorflow::Status Unregister(const UnregisterRequest* arg, - UnregisterResponse* result) override; + Status Unregister(const UnregisterRequest* arg, + UnregisterResponse* result) override; grpc::XlaService::Stub* service() { return grpc_stub_; } diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto index c47164ee1b7657..92eb19ec0f9696 100644 --- a/tensorflow/compiler/xla/rpc/xla_service.proto +++ b/tensorflow/compiler/xla/rpc/xla_service.proto @@ -75,19 +75,7 @@ service XlaService { rpc GetShape(GetShapeRequest) returns (GetShapeResponse) { } - // Requests the program shape of the referenced computation. - rpc GetComputationShape(GetComputationShapeRequest) - returns (GetComputationShapeResponse) { - } - // Requests the statistics of the given computation. - rpc GetComputationStats(ComputationStatsRequest) - returns (ComputationStatsResponse) { - } - - // Requests the statistics of the given computation. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. 
rpc GetComputationGraphStats(ComputationGraphStatsRequest) returns (ComputationStatsResponse) { } @@ -121,15 +109,6 @@ service XlaService { rpc ResetDevice(ResetDeviceRequest) returns (ResetDeviceResponse) { } - // Tests if an expression is a compile-time constant. - rpc IsConstant(IsConstantRequest) returns (IsConstantResponse) { - } - - // Computes the value of a constant expression. - rpc ComputeConstant(ComputeConstantRequest) - returns (ComputeConstantResponse) { - } - // Computes the value of a constant expression. The request contains the // computation graph for the constant expression. rpc ComputeConstantGraph(ComputeConstantGraphRequest) @@ -165,20 +144,6 @@ service XlaService { rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) { } - // Computation creates a new computation with the given name. - // A unique ComputationHandle is returned. - rpc Computation(ComputationRequest) returns (ComputationResponse) { - } - - // Adds a new op to a computation. - rpc CreateOp(OpRequest) returns (OpResponse) { - } - - // Invokes the provided computation with the provided global data passed as - // immutable arguments. Returns global data output and execution timing. - rpc Execute(ExecuteRequest) returns (ExecuteResponse) { - } - // Invokes the provided computation with the provided global data passed as // immutable arguments. The request contains the whole computation graph. // Returns global data output and execution timing. @@ -188,38 +153,13 @@ service XlaService { // Invokes the provided list of computations in parallel with the provided // global data for each computation. Returns a list of global data output and // execution timing. - rpc ExecuteParallel(ExecuteParallelRequest) - returns (ExecuteParallelResponse) { - } - - // Invokes the provided list of computations in parallel with the provided - // global data for each computation. Returns a list of global data output and - // execution timing. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. rpc ExecuteGraphParallel(ExecuteGraphParallelRequest) returns (ExecuteParallelResponse) { } - // Invokes the provided computation with the provided global data passed as - // immutable arguments. Returns a handle to the execution. - rpc ExecuteAsync(ExecuteAsyncRequest) returns (ExecuteAsyncResponse) { - } - // Waits until the given execution (aysnchronously launched) is complete, and // returns the global data output. rpc WaitForExecution(WaitForExecutionRequest) returns (WaitForExecutionResponse) { } - - // Serializes a computation to proto form, so it can be loaded via - // LoadComputationSnapshot. - rpc SnapshotComputation(SnapshotComputationRequest) - returns (SnapshotComputationResponse) { - } - - // Loads a computation from a captured snapshot. 
- rpc LoadComputationSnapshot(LoadComputationSnapshotRequest) - returns (LoadComputationSnapshotResponse) { - } } diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD index e794ec12c7b7af..1f2de0c9553933 100644 --- a/tensorflow/compiler/xla/service/BUILD +++ b/tensorflow/compiler/xla/service/BUILD @@ -12,21 +12,26 @@ package_group( ], ) +load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test") load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.bzl", "tf_cc_binary") +load( + "//tensorflow/core:platform/default/build_config.bzl", + "tf_proto_library_py", +) xla_proto_library( - name = "session_proto", - srcs = ["session.proto"], + name = "hlo_proto", + srcs = ["hlo.proto"], visibility = ["//visibility:public"], deps = ["//tensorflow/compiler/xla:xla_data_proto"], ) -xla_proto_library( - name = "hlo_proto", +tf_proto_library_py( + name = "hlo_proto", # bzl adds a _py suffix only to the OSS target. srcs = ["hlo.proto"], - deps = ["//tensorflow/compiler/xla:xla_data_proto"], + visibility = ["//visibility:public"], ) xla_proto_library( @@ -200,7 +205,22 @@ tf_cc_test( cc_library( name = "hlo_evaluator", - srcs = ["hlo_evaluator.cc"], + srcs = [ + "hlo_evaluator.cc", + "hlo_evaluator_typed_visitor.h", + "hlo_evaluator_typed_visitor_bfloat16.cc", + "hlo_evaluator_typed_visitor_bool.cc", + "hlo_evaluator_typed_visitor_complex64.cc", + "hlo_evaluator_typed_visitor_double.cc", + "hlo_evaluator_typed_visitor_float.cc", + "hlo_evaluator_typed_visitor_half.cc", + "hlo_evaluator_typed_visitor_int32.cc", + "hlo_evaluator_typed_visitor_int64.cc", + "hlo_evaluator_typed_visitor_int8.cc", + "hlo_evaluator_typed_visitor_uint32.cc", + "hlo_evaluator_typed_visitor_uint64.cc", + "hlo_evaluator_typed_visitor_uint8.cc", + ], hdrs = ["hlo_evaluator.h"], deps = [ ":hlo", @@ -233,7 +253,7 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/service:hlo_element_type_converter", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -256,7 +276,9 @@ cc_library( hdrs = [ "dfs_hlo_visitor.h", "dfs_hlo_visitor_with_default.h", + "hlo_clone_context.h", "hlo_computation.h", + "hlo_domain_metadata.h", "hlo_instruction.h", "hlo_module.h", "hlo_opcode.h", @@ -280,6 +302,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:human_readable_json", "//tensorflow/core:lib", "//tensorflow/core:lib_internal", ], @@ -319,8 +342,8 @@ tf_cc_test( ":hlo", ":pattern_matcher", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:test", ], ) @@ -358,6 +381,7 @@ cc_library( deps = [ ":hlo", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", ], @@ -368,6 +392,7 @@ tf_cc_test( srcs = ["hlo_matchers_test.cc"], deps = [ ":hlo_matchers", + ":hlo_parser", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], 
@@ -389,12 +414,14 @@ tf_cc_test( srcs = ["hlo_instruction_test.cc"], deps = [ ":hlo", + ":hlo_parser", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:protobuf_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -411,6 +438,7 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -515,45 +543,6 @@ tf_cc_test( ], ) -cc_library( - name = "user_computation", - srcs = ["user_computation.cc"], - hdrs = ["user_computation.h"], - deps = [ - ":hlo", - ":session_proto", - ":shape_inference", - ":versioned_computation_handle", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/core:lib", - ], -) - -tf_cc_test( - name = "user_computation_test", - srcs = ["user_computation_test.cc"], - deps = [ - ":hlo_matchers", - ":user_computation", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:test", - ], -) - cc_library( name = "platform_util", srcs = ["platform_util.cc"], @@ -602,7 +591,6 @@ cc_library( ":compilation_cache", ":compiler", ":computation_layout", - ":computation_tracker", ":device_memory_allocator", ":executable", ":execution_tracker", @@ -613,10 +601,8 @@ cc_library( ":hlo_module_config", ":hlo_proto_util", ":platform_util", - ":session_proto", ":source_map_util", ":transfer_manager", - ":user_computation", ":versioned_computation_handle", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:execution_options_util", @@ -644,7 +630,6 @@ cc_library( ":backend", ":compiler", ":computation_layout", - ":computation_tracker", ":device_memory_allocator", ":executable", ":hlo", @@ -653,7 +638,6 @@ cc_library( ":platform_util", ":service", ":shaped_buffer", - ":user_computation", ":versioned_computation_handle", "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:shape_layout", @@ -678,7 +662,6 @@ cc_library( ":backend", ":compiler", ":computation_layout", - ":computation_tracker", ":platform_util", ":service", "//tensorflow/compiler/xla:status_macros", @@ -744,6 +727,23 @@ cc_library( ], ) +tf_cc_test( + name = "shaped_buffer_test", + srcs = ["shaped_buffer_test.cc"], + deps = [ + ":cpu_plugin", + ":device_memory_allocator", + ":platform_util", + ":shaped_buffer", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:ptr_util", + "//tensorflow/core:test", + ], +) + cc_library( name 
= "executable", srcs = ["executable.cc"], @@ -759,7 +759,6 @@ cc_library( ":hlo_graph_dumper", ":hlo_proto", ":pool", - ":session_proto", ":shaped_buffer", ":versioned_computation_handle", "//tensorflow/compiler/xla:executable_run_options", @@ -781,6 +780,7 @@ cc_library( srcs = ["compiler.cc"], hdrs = ["compiler.h"], deps = [ + ":buffer_value", ":executable", ":hlo", ":hlo_module_config", @@ -837,7 +837,6 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", ], ) @@ -857,33 +856,12 @@ cc_library( ], ) -cc_library( - name = "computation_tracker", - srcs = ["computation_tracker.cc"], - hdrs = ["computation_tracker.h"], - deps = [ - ":hlo", - ":hlo_module_config", - ":session_proto", - ":user_computation", - ":versioned_computation_handle", - "//tensorflow/compiler/xla:status_macros", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", - ], -) - cc_library( name = "channel_tracker", srcs = ["channel_tracker.cc"], hdrs = ["channel_tracker.h"], deps = [ ":hlo", - ":session_proto", - ":user_computation", ":versioned_computation_handle", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", @@ -918,33 +896,6 @@ tf_cc_test( ], ) -cc_library( - name = "liveness_util", - srcs = ["liveness_util.cc"], - hdrs = ["liveness_util.h"], - deps = [ - ":hlo", - ":hlo_dataflow_analysis", - ":logical_buffer", - ":tuple_points_to_analysis", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - ], -) - -tf_cc_test( - name = "liveness_util_test", - srcs = ["liveness_util_test.cc"], - deps = [ - ":hlo", - ":liveness_util", - ":tuple_points_to_analysis", - "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - ], -) - cc_library( name = "buffer_liveness", srcs = [ @@ -956,7 +907,6 @@ cc_library( deps = [ ":hlo", ":hlo_ordering", - ":liveness_util", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", @@ -993,6 +943,7 @@ cc_library( ], deps = [ ":buffer_liveness", + ":buffer_value_containers", ":heap_simulator", ":hlo", ":hlo_proto", @@ -1015,8 +966,8 @@ tf_cc_test( srcs = ["buffer_assignment_test.cc"], deps = [ ":buffer_assignment", + ":buffer_value", ":call_graph", - ":computation_tracker", ":copy_insertion", ":cpu_plugin", ":flatten_call_graph", @@ -1030,9 +981,9 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", ], ) @@ -1047,7 +998,6 @@ cc_library( ":hlo_dataflow_analysis", ":hlo_proto", ":hlo_value", - ":liveness_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -1069,9 +1019,9 @@ tf_cc_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - 
"//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -1080,11 +1030,11 @@ cc_library( srcs = ["heap_simulator.cc"], hdrs = ["heap_simulator.h"], deps = [ + ":buffer_value", + ":buffer_value_containers", ":hlo", ":hlo_ordering", ":hlo_proto", - ":liveness_util", - ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", @@ -1096,10 +1046,11 @@ tf_cc_test( name = "heap_simulator_test", srcs = ["heap_simulator_test.cc"], deps = [ + ":buffer_value", ":heap_simulator", ":hlo", ":hlo_ordering", - ":logical_buffer", + ":hlo_value", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:status_macros", @@ -1164,15 +1115,16 @@ tf_cc_test( name = "hlo_scheduling_test", srcs = ["hlo_scheduling_test.cc"], deps = [ + ":buffer_value", ":hlo", ":hlo_ordering", ":hlo_scheduling", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -1205,6 +1157,7 @@ tf_cc_test( deps = [ ":hlo_matchers", ":instruction_fusion", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -1247,13 +1200,11 @@ cc_library( deps = [ ":hlo", ":hlo_pass", - ":hlo_query", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", ], @@ -1330,6 +1281,43 @@ tf_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + +cc_library( + name = "batch_dot_simplification", + srcs = ["batch_dot_simplification.cc"], + hdrs = ["batch_dot_simplification.h"], + deps = [ + ":hlo", + ":hlo_creation_utils", + ":hlo_pass", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "batch_dot_simplification_test", + srcs = ["batch_dot_simplification_test.cc"], + deps = [ + ":batch_dot_simplification", + ":hlo", + ":hlo_matchers", + ":hlo_pass", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:window_util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/core:lib", @@ -1343,9 +1331,9 @@ tf_cc_test( deps = [ ":gather_expander", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:test_macros_header", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], 
) @@ -1651,23 +1639,21 @@ tf_cc_test( name = "hlo_cost_analysis_test", srcs = ["hlo_cost_analysis_test.cc"], deps = [ - ":computation_tracker", ":cpu_plugin", ":hlo", ":hlo_cost_analysis", ":local_service", ":service", - ":user_computation", ":versioned_computation_handle", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -1697,8 +1683,10 @@ tf_cc_test( ":cpu_plugin", ":hlo_cost_analysis", ":hlo_execution_profile", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", ], ) @@ -1748,11 +1736,38 @@ tf_cc_test( ], ) +cc_library( + name = "buffer_value", + srcs = ["buffer_value.cc"], + hdrs = ["buffer_value.h"], + deps = [ + ":hlo", + ":hlo_proto", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + +cc_library( + name = "buffer_value_containers", + hdrs = ["buffer_value_containers.h"], + deps = [ + ":buffer_value", + ":logical_buffer", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + cc_library( name = "logical_buffer", srcs = ["logical_buffer.cc"], hdrs = ["logical_buffer.h"], deps = [ + ":buffer_value", ":hlo", ":hlo_proto", "//tensorflow/compiler/xla:shape_util", @@ -1768,6 +1783,7 @@ cc_library( srcs = ["hlo_value.cc"], hdrs = ["hlo_value.h"], deps = [ + ":buffer_value", ":hlo", "//tensorflow/compiler/xla:shape_tree", "//tensorflow/compiler/xla:shape_util", @@ -1820,6 +1836,44 @@ tf_cc_test( ], ) +cc_library( + name = "hlo_liveness_analysis", + srcs = ["hlo_liveness_analysis.cc"], + hdrs = ["hlo_liveness_analysis.h"], + deps = [ + ":call_graph", + ":hlo", + ":hlo_value", + "//tensorflow/compiler/xla:shape_tree", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "hlo_liveness_analysis_test", + srcs = ["hlo_liveness_analysis_test.cc"], + deps = [ + ":hlo", + ":hlo_liveness_analysis", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_buffer", srcs = ["hlo_buffer.cc"], @@ -1956,10 +2010,12 @@ cc_library( deps = [ ":computation_layout", ":hlo", + ":hlo_dce", ":hlo_graph_dumper", ":hlo_pass", ":logical_buffer", ":tuple_points_to_analysis", + 
":tuple_simplifier", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", @@ -1983,7 +2039,6 @@ cc_library( ":hlo_graph_dumper", ":hlo_ordering", ":hlo_pass", - ":liveness_util", ":logical_buffer", ":tuple_simplifier", "//tensorflow/compiler/xla:status_macros", @@ -2030,6 +2085,24 @@ cc_library( ], ) +cc_library( + name = "hlo_module_dce", + srcs = ["hlo_module_dce.cc"], + hdrs = ["hlo_module_dce.h"], + deps = [ + ":hlo", + ":hlo_dce", + ":hlo_liveness_analysis", + ":hlo_pass", + "//tensorflow/compiler/xla:status", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + cc_library( name = "hlo_verifier", srcs = ["hlo_verifier.cc"], @@ -2065,13 +2138,13 @@ cc_library( hdrs = ["hlo_rematerialization.h"], deps = [ ":buffer_liveness", + ":buffer_value", ":call_graph", ":flatten_call_graph", ":hlo", ":hlo_dce", ":hlo_ordering", ":hlo_scheduling", - ":liveness_util", ":logical_buffer", ":tuple_points_to_analysis", "//tensorflow/compiler/xla:shape_util", @@ -2119,6 +2192,27 @@ tf_cc_test( ], ) +tf_cc_test( + name = "hlo_module_dce_test", + srcs = ["hlo_module_dce_test.cc"], + deps = [ + ":hlo", + ":hlo_module_dce", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", + ], +) + tf_cc_test( name = "layout_assignment_test", srcs = ["layout_assignment_test.cc"], @@ -2135,9 +2229,9 @@ tf_cc_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:test_utils", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", ], @@ -2185,6 +2279,7 @@ cc_library( hdrs = ["hlo_cse.h"], deps = [ ":hlo", + ":hlo_domain_map", ":hlo_pass", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_util", @@ -2207,6 +2302,7 @@ tf_cc_test( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:test_utils", @@ -2248,6 +2344,79 @@ tf_cc_test( ], ) +cc_library( + name = "hlo_domain_map", + srcs = ["hlo_domain_map.cc"], + hdrs = ["hlo_domain_map.h"], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "hlo_sharding_metadata", + srcs = ["hlo_sharding_metadata.cc"], + hdrs = [ + "hlo_sharding_metadata.h", + ], + deps = [ + ":hlo", + "//tensorflow/compiler/xla:shape_tree", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/core:lib", + ], +) + +cc_library( + name = "hlo_domain_isolator", + srcs = 
["hlo_domain_isolator.cc"], + hdrs = ["hlo_domain_isolator.h"], + deps = [ + ":hlo", + ":hlo_graph_dumper", + ":hlo_pass", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + ], +) + +cc_library( + name = "hlo_domain_remover", + srcs = ["hlo_domain_remover.cc"], + hdrs = ["hlo_domain_remover.h"], + deps = [ + ":hlo", + ":hlo_domain_isolator", + ":hlo_domain_map", + ":hlo_graph_dumper", + ":hlo_pass", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "hlo_domain_test", + srcs = ["hlo_domain_test.cc"], + deps = [ + ":hlo", + ":hlo_domain_isolator", + ":hlo_domain_remover", + ":hlo_parser", + ":hlo_sharding_metadata", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:test", + ], +) + cc_library( name = "hlo_element_type_converter", srcs = ["hlo_element_type_converter.cc"], @@ -2276,8 +2445,14 @@ tf_cc_test( cc_library( name = "device_memory_allocator", - srcs = ["device_memory_allocator.cc"], - hdrs = ["device_memory_allocator.h"], + srcs = [ + "device_memory_allocator.cc", + "owning_device_memory.cc", + ], + hdrs = [ + "device_memory_allocator.h", + "owning_device_memory.h", + ], deps = [ "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", @@ -2312,6 +2487,24 @@ cc_library( ], ) +xla_test( + name = "elemental_ir_emitter_test", + srcs = ["elemental_ir_emitter_test.cc"], + backends = [ + "cpu", + "gpu", + ], + deps = [ + "//tensorflow/compiler/xla:execution_options_util", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) + cc_library( name = "hlo_module_config", srcs = ["hlo_module_config.cc"], @@ -2383,7 +2576,6 @@ tf_cc_test( srcs = ["hlo_tfgraph_builder_test.cc"], deps = [ ":hlo_tfgraph_builder", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:protos_all_cc", @@ -2436,6 +2628,7 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", ], ) @@ -2445,6 +2638,7 @@ tf_cc_test( srcs = ["transpose_folding_test.cc"], deps = [ ":hlo", + ":hlo_matchers", ":shape_inference", ":transpose_folding", "//tensorflow/compiler/xla:literal_util", @@ -2452,7 +2646,8 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service/gpu:ir_emission_utils", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -2488,7 +2683,7 @@ tf_cc_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", + 
"//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -2592,12 +2787,11 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:compiler", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/core:core_cpu_internal", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//third_party/eigen3", - "@com_google_absl//absl/memory", ], ) @@ -2629,8 +2823,8 @@ tf_cc_test( ":tuple_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -2653,9 +2847,10 @@ tf_cc_test( deps = [ ":while_util", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -2681,6 +2876,34 @@ tf_cc_test( ":hlo_matchers", ":while_loop_invariant_code_motion", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/core:test", + ], +) + +cc_library( + name = "while_loop_constant_sinking", + srcs = ["while_loop_constant_sinking.cc"], + hdrs = ["while_loop_constant_sinking.h"], + deps = [ + ":hlo", + ":hlo_pass", + ":while_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + ], +) + +tf_cc_test( + name = "while_loop_constant_sinking_test", + srcs = ["while_loop_constant_sinking_test.cc"], + deps = [ + ":hlo_matchers", + ":while_loop_constant_sinking", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_verified_test_base", "//tensorflow/core:test", ], @@ -2712,3 +2935,96 @@ cc_library( "//tensorflow/core:lib", ], ) + +cc_library( + name = "indexed_array_analysis", + srcs = ["indexed_array_analysis.cc"], + hdrs = ["indexed_array_analysis.h"], + deps = [ + ":hlo", + ":hlo_evaluator", + ":hlo_pass", + "//tensorflow/compiler/xla:util", + "//tensorflow/core:lib", + "//tensorflow/core:ptr_util", + ], +) + +tf_cc_test( + name = "indexed_array_analysis_test", + srcs = ["indexed_array_analysis_test.cc"], + deps = [ + ":hlo_matchers", + ":indexed_array_analysis", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:hlo_verified_test_base", + "//tensorflow/compiler/xla/tests:test_utils", + "//tensorflow/core:test", + ], +) + +cc_library( + name = "hlo_parser", + srcs = ["hlo_parser.cc"], + hdrs = ["hlo_parser.h"], + deps = [ + ":hlo", + ":hlo_lexer", + ":hlo_sharding_metadata", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + ], +) + +tf_cc_test( + name = "hlo_parser_test", + size = "small", + srcs = ["hlo_parser_test.cc"], + deps = [ + ":hlo_parser", + 
"//tensorflow/compiler/xla:window_util", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", # fixdeps: keep + ], +) + +cc_library( + name = "hlo_lexer", + srcs = ["hlo_lexer.cc"], + hdrs = [ + "hlo_lexer.h", + "hlo_token.h", + ], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:lib", + "//tensorflow/core:regexp_internal", + ], +) + +cc_library( + name = "hlo_casting_utils", + hdrs = ["hlo_casting_utils.h"], + deps = [":hlo"], +) + +tf_cc_test( + name = "hlo_casting_utils_test", + srcs = ["hlo_casting_utils_test.cc"], + deps = [ + ":hlo_casting_utils", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep + "//tensorflow/core:test", + ], +) diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc index 8e785de68cb1fb..dc5f1b31bf8510 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc @@ -92,26 +92,6 @@ bool ReshapeIsBitcast( valid_bitcast_callback(operand->shape(), reshape->shape()); } -// Adds a scalar computation to the module to enable optimizations with dot -// converting into reduction. -HloComputation* CreateScalarBinaryComputation(HloModule* module, - PrimitiveType primitive_type, - HloOpcode opcode) { - HloComputation::Builder b("scalar_computation"); - auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {}), "scalar_lhs")); - auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter( - 1, ShapeUtil::MakeShape(F32, {}), "scalar_rhs")); - auto scalar_op = b.AddInstruction( - HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}), - opcode, scalar_lhs, scalar_rhs)); - HloComputation* scalar_computation = - module->AddEmbeddedComputation(b.Build(scalar_op)); - return scalar_computation; -} - -} // namespace - // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain // algebraic expressions to simplified forms. Note: This only supports // simplifications that simply look at the operands of an instruction. 
For the @@ -177,6 +157,8 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { Status HandleSubtract(HloInstruction* sub) override; + Status HandleMap(HloInstruction* map) override; + Status HandleMaximum(HloInstruction* maximum) override; Status HandleMinimum(HloInstruction* minimum) override; @@ -220,8 +202,7 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { HloInstruction* AddReduce(HloInstruction* hlo, int64 dim) { HloInstruction* zero = computation_->AddInstruction( HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - HloComputation* AddReduce_computation = CreateScalarBinaryComputation( - computation_->parent(), F32, HloOpcode::kAdd); + HloComputation* AddReduce_computation = GetOrCreateScalarAddComputation(); Shape shape = ShapeUtil::DeleteDimension(dim, hlo->shape()); return computation_->AddInstruction(HloInstruction::CreateReduce( shape, hlo, zero, {dim}, AddReduce_computation)); @@ -252,10 +233,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { HloInstruction* operand, HloInstruction* max, HloInstruction* max_operand); - // A Reshape or Broadcast that feeds an element-wise operation with a unique - // non-scalar operand can sink to after the operation. - StatusOr TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand( - HloInstruction* reshape_or_broadcast); + // A Broadcast that feeds an element-wise operation with a unique non-scalar + // operand can sink to after the operation. + StatusOr TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( + HloInstruction* broadcast); // Replaces the existing HLO instruction old_instruction, with // new_instruction, and marks the optimizer status as changed. @@ -291,6 +272,26 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { const Shape& dot_shape, HloInstruction* lhs, int64 lhs_contracting_dim, HloInstruction* rhs, int64 rhs_contracting_dim, bool swapped); + StatusOr OptimizeDotOfGather(HloInstruction* dot); + + HloComputation* GetOrCreateScalarAddComputation() { + if (scalar_add_computation_) { + return scalar_add_computation_; + } + + HloComputation::Builder b("scalar_add_computation"); + Shape shape = ShapeUtil::MakeShape(F32, {}); + auto scalar_lhs = b.AddInstruction( + HloInstruction::CreateParameter(0, shape, "scalar_lhs")); + auto scalar_rhs = b.AddInstruction( + HloInstruction::CreateParameter(1, shape, "scalar_rhs")); + auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs)); + scalar_add_computation_ = + computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); + return scalar_add_computation_; + } + // Current HloComputation instance the AlgebraicSimplifierVisitor is // traversing. HloComputation* computation_; @@ -309,8 +310,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault { // Disable convolution simplification on platforms where it causes a slowdown. bool enable_conv_simplification_; + + // Cached computation for adding two scalar F32. 
+ HloComputation* scalar_add_computation_ = nullptr; }; +} // namespace + bool AlgebraicSimplifierVisitor::Run( HloComputation* computation, bool is_layout_sensitive, AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback, @@ -499,13 +505,13 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate( } static HloInstruction* BuildTupleConstant(HloComputation* computation, - const Literal& literal) { + const LiteralSlice& literal) { if (ShapeUtil::IsTuple(literal.shape())) { std::vector elems; elems.reserve(ShapeUtil::TupleElementCount(literal.shape())); for (int i = 0; i < ShapeUtil::TupleElementCount(literal.shape()); ++i) { elems.push_back( - BuildTupleConstant(computation, LiteralView::Create(literal, {i}))); + BuildTupleConstant(computation, LiteralSlice(literal, {i}))); } return computation->AddInstruction(HloInstruction::CreateTuple(elems)); } else { @@ -912,6 +918,134 @@ StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfConcatHelper( return add_result; } +StatusOr AlgebraicSimplifierVisitor::OptimizeDotOfGather( + HloInstruction* dot) { + const DotDimensionNumbers& dnums = dot->dot_dimension_numbers(); + if (dnums.lhs_contracting_dimensions_size() != 1 || + dnums.rhs_contracting_dimensions_size() != 1 || + dnums.lhs_batch_dimensions_size() != 0 || + dnums.rhs_batch_dimensions_size() != 0 || + dot->shape().dimensions_size() != 2) { // dot output 2D + VLOG(10) << "DotOfGather: Can only optimize 2D, non-batch dot operations."; + return nullptr; + } + + // Optimize either dot(DS(ctA), ctB)) or dot(ctB, DS(ctA)). + // Currently a Gather is a DynamicSlice. + auto is_dynamic_slice_constant_combination = + [](HloInstruction* a, HloInstruction* b, int a_contracting_dimension) { + // First operand is a DynamicSlice(Constant). + if (a->opcode() != HloOpcode::kDynamicSlice) { + return false; + } + auto* dynamic_slice_op = a->operand(0); + if (dynamic_slice_op->opcode() != HloOpcode::kConstant) { + return false; + } + // Second operand is a Constant. + if (b->opcode() != HloOpcode::kConstant) { + return false; + } + // The DynamicSlice output is a vector. + const Shape& dynamic_slice_shape = a->shape(); + if (dynamic_slice_shape.dimensions(1 - a_contracting_dimension) != 1) { + return false; + } + // Constant size is the same before and after slice in the contracting + // dimension, otherwise we either must precompute for all possible slice + // indices or dot is invalid. + const Shape& dynamic_slice_op_shape = dynamic_slice_op->shape(); + if (dynamic_slice_op_shape.dimensions(a_contracting_dimension) != + dynamic_slice_shape.dimensions(a_contracting_dimension)) { + return false; + } + return true; + }; + + HloInstruction* lhs = dot->mutable_operand(0); + HloInstruction* rhs = dot->mutable_operand(1); + int lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0); + int rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0); + + if (!is_dynamic_slice_constant_combination( + lhs, rhs, /*a_contracting_dimension=*/lhs_contracting_dimension) && + !is_dynamic_slice_constant_combination( + rhs, lhs, /*a_contracting_dimension=*/rhs_contracting_dimension)) { + VLOG(10) << "DotOfGather: Can only optimize dot(DS(ctA), ctB)) or " + "dot(ctB, DS(ctA)), where the two constants have equal " + "contracting dimensions."; + return nullptr; + } + + // LHS is DynamicSlice: + // input: dot(DS(ctA), ctB)) + // where DS(ctA) = DS({M x K}, {start, 0}, {1, K}) and ctB = {K x N}. + // => input dimensions: dot({1 x K}, {K x N}) => {1 x N}. 
+ // output: DS(dot(ctA, ctB)) + // => output dimensions: DS ({M x N}, {start, 0}, {1, N}) => {1 x N}. + + // RHS is DynamicSlice: + // input: dot(ctA, DS(ctB)) + // where ctA = {M x K} and DS(ctB) = DS({K x N}, {0, start}, {K, 1}). + // => input dimensions: dot({M x K}, {K x 1}) => {M x 1}. + // output: DS(dot(ctA, ctB)) + // => output dimensions: DS ({M x N}, {0, start}, {M, 1}) => {M x 1}. + + bool lhs_is_dynamic_slice = lhs->opcode() == HloOpcode::kDynamicSlice; + + // ctA: + HloInstruction* left_operand = + lhs_is_dynamic_slice ? lhs->mutable_operand(0) : lhs; + // ctB: + HloInstruction* right_operand = + lhs_is_dynamic_slice ? rhs : rhs->mutable_operand(0); + // Build ctA x ctB. + const int m = left_operand->shape().dimensions(1 - lhs_contracting_dimension); + const int n = + right_operand->shape().dimensions(1 - rhs_contracting_dimension); + auto memoized_shape = ShapeUtil::MakeShape(F32, {m, n}); + auto* memoized_inst = computation_->AddInstruction(HloInstruction::CreateDot( + memoized_shape, left_operand, right_operand, dnums)); + // Get pair {start, 0} or {0, start}. + HloInstruction* original_start_indices = + lhs_is_dynamic_slice ? lhs->mutable_operand(1) : rhs->mutable_operand(1); + // Position of start: + int index_of_non_zero_start = lhs_is_dynamic_slice + ? 1 - lhs_contracting_dimension + : 1 - rhs_contracting_dimension; + // Position of zero: + int index_of_zero_start = 1 - index_of_non_zero_start; + + // Slice out start and 0 components and reorder if necessary. + auto indices_type = original_start_indices->shape().element_type(); + Shape s_shape = ShapeUtil::MakeShape(indices_type, {1}); + Shape d_shape = ShapeUtil::MakeShape(indices_type, {2}); + HloInstruction* non_zero_start = + computation_->AddInstruction(HloInstruction::CreateSlice( + s_shape, original_start_indices, {index_of_non_zero_start}, + {index_of_non_zero_start + 1}, {1})); + HloInstruction* zero_start = + computation_->AddInstruction(HloInstruction::CreateSlice( + s_shape, original_start_indices, {index_of_zero_start}, + {index_of_zero_start + 1}, {1})); + HloInstruction* new_start_indices = + lhs_is_dynamic_slice + ? computation_->AddInstruction(HloInstruction::CreateConcatenate( + d_shape, {non_zero_start, zero_start}, 0)) + : computation_->AddInstruction(HloInstruction::CreateConcatenate( + d_shape, {zero_start, non_zero_start}, 0)); + + // Build DynamicSlice(ctA x ctB). + const int new_slice_m = lhs_is_dynamic_slice ? 1 : m; + const int new_slice_n = lhs_is_dynamic_slice ? n : 1; + auto* memoized_lookup = + computation_->AddInstruction(HloInstruction::CreateDynamicSlice( + dot->shape(), memoized_inst, new_start_indices, + {new_slice_m, new_slice_n})); + + return memoized_lookup; +} + Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { HloInstruction *lhs, *rhs; CHECK(Match(dot, m::Dot(m::Op(&lhs), m::Op(&rhs)))); @@ -941,6 +1075,17 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) { return ReplaceInstruction(dot, dot_of_concat_optimized); } + // Simplify dot(ConstA, Gather(Index, ConstB)) to: + // Gather(Index, dot*(ConstA, ConstB)), where dot* is an appropriately + // batched version of dot. 
+ TF_ASSIGN_OR_RETURN(HloInstruction * dot_of_gather_optimized, + OptimizeDotOfGather(dot)); + if (dot_of_gather_optimized) { + VLOG(10) << "Replaced dot(constA, gather(i, constB)) with " + "gather(i, dot*(constA, constB))"; + return ReplaceInstruction(dot, dot_of_gather_optimized); + } + if (enable_dot_strength_reduction_ && !is_layout_sensitive_) { TF_ASSIGN_OR_RETURN(bool did_strength_reduction, HandleDotStrengthReduction(dot)); @@ -1160,7 +1305,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) { // broadcast after the unary element-wise operation. TF_ASSIGN_OR_RETURN( bool sink_succeeded, - TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(broadcast)); + TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(broadcast)); changed_ |= sink_succeeded; if (sink_succeeded) { return Status::OK(); @@ -1412,15 +1557,16 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) { return Status::OK(); } -StatusOr AlgebraicSimplifierVisitor:: - TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand( - HloInstruction* reshape_or_broadcast) { +StatusOr +AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand( + HloInstruction* broadcast) { + TF_RET_CHECK(broadcast->opcode() == HloOpcode::kBroadcast); bool changed = false; - if (ShapeUtil::IsScalar(reshape_or_broadcast->shape())) { + if (ShapeUtil::IsScalar(broadcast->shape())) { return false; } - HloInstruction* operand = reshape_or_broadcast->mutable_operand(0); - for (HloInstruction* user : reshape_or_broadcast->users()) { + HloInstruction* operand = broadcast->mutable_operand(0); + for (HloInstruction* user : broadcast->users()) { if (user->user_count() == 0 && user != computation_->root_instruction()) { continue; } @@ -1438,55 +1584,50 @@ StatusOr AlgebraicSimplifierVisitor:: continue; } - int64 reshape_or_broadcast_operand_index = -1; // Find the unique non-scalar operand or continue if there isn't one. 
- int64 scalar_count = 0; - for (int64 i = 0; i < user->operand_count(); ++i) { - if (ShapeUtil::IsScalar(user->operand(i)->shape())) { - ++scalar_count; - } else { - reshape_or_broadcast_operand_index = i; + int64 scalar_broadcast_count = 0; + int64 broadcast_use_count = 0; + for (HloInstruction* user_operand : user->operands()) { + if (user_operand->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { + ++scalar_broadcast_count; + } else if (broadcast == user_operand) { + ++broadcast_use_count; } } - if (scalar_count != user->operand_count() - 1) { + if (scalar_broadcast_count + broadcast_use_count != user->operand_count()) { continue; } - VLOG(4) << "Sinking reshape or broadcast after user:"; - VLOG(4) << " old reshape/broadcast: " << reshape_or_broadcast->ToString(); + std::vector new_operands; + new_operands.reserve(user->operand_count()); + + for (HloInstruction* user_operand : user->operands()) { + if (user_operand->opcode() == HloOpcode::kBroadcast && + ShapeUtil::IsScalar(user_operand->operand(0)->shape())) { + new_operands.push_back( + computation_->AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::ChangeElementType( + operand->shape(), user_operand->shape().element_type()), + user_operand->mutable_operand(0), {}))); + } else { + CHECK_EQ(broadcast, user_operand); + new_operands.push_back(operand); + } + } + VLOG(4) << "Sinking broadcast after user:"; + VLOG(4) << " old broadcast: " << broadcast->ToString(); VLOG(4) << " old user: " << user->ToString(); - CHECK_EQ(user->operand(reshape_or_broadcast_operand_index), - reshape_or_broadcast); - auto new_user_operands = user->operands(); - new_user_operands[reshape_or_broadcast_operand_index] = operand; - auto new_user = computation_->AddInstruction(user->CloneWithNewOperands( - ShapeUtil::MakeShapeWithLayout( - user->shape().element_type(), - AsInt64Slice(operand->shape().dimensions()), - LayoutUtil::MinorToMajor(operand->shape())), - new_user_operands)); + HloInstruction* new_user = + computation_->AddInstruction(user->CloneWithNewOperands( + ShapeUtil::ChangeElementType(operand->shape(), + user->shape().element_type()), + new_operands)); VLOG(4) << " new user: " << new_user->ToString(); - HloInstruction* new_reshape_or_broadcast = nullptr; - if (reshape_or_broadcast->opcode() == HloOpcode::kReshape) { - new_reshape_or_broadcast = - computation_->AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShapeWithLayout( - user->shape().element_type(), - AsInt64Slice(reshape_or_broadcast->shape().dimensions()), - LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())), - new_user)); - } else { - TF_RET_CHECK(reshape_or_broadcast->opcode() == HloOpcode::kBroadcast); - new_reshape_or_broadcast = - computation_->AddInstruction(HloInstruction::CreateBroadcast( - ShapeUtil::MakeShapeWithLayout( - user->shape().element_type(), - AsInt64Slice(reshape_or_broadcast->shape().dimensions()), - LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())), - new_user, reshape_or_broadcast->dimensions())); - } - VLOG(4) << " new reshape/broadcast: " - << new_reshape_or_broadcast->ToString(); - TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(new_reshape_or_broadcast)); + HloInstruction* new_broadcast = + computation_->AddInstruction(HloInstruction::CreateBroadcast( + user->shape(), new_user, broadcast->dimensions())); + VLOG(4) << " new broadcast: " << new_broadcast->ToString(); + TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(new_broadcast)); changed = true; } return changed; @@ -1529,16 +1670,6 @@ 
Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) { } } - // A Reshape that feeds a unary element-wise operation can sink the - // reshape after the unary element-wise operation. - TF_ASSIGN_OR_RETURN( - bool sink_succeeded, - TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(reshape)); - changed_ |= sink_succeeded; - if (sink_succeeded) { - return Status::OK(); - } - // Make this a bitcast if possible. if (is_layout_sensitive_ && ReshapeIsBitcast(reshape, valid_bitcast_callback_)) { @@ -1643,6 +1774,15 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { new_reduce_dimensions, function)); } + // If the reduction results in the same number of elements, then the only + // possible side effect would be a reshape. Since the init_value is an + // identity of the reduction function, we can therefore replace the reduce + // with a simple reshape, ignoring the reduction function completely. + if (ShapeUtil::ElementsIn(reduce->shape()) == + ShapeUtil::ElementsIn(arg->shape())) { + return ReplaceWithNewInstruction( + reduce, HloInstruction::CreateReshape(reduce->shape(), arg)); + } // A reshape that collapses multiple dimensions into a dimension being // reduced can just reduce all of those dimensions instead of doing a // collapsing reshape before a reduction. @@ -1687,15 +1827,6 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) { new_reduce_dimensions, function)); } } - if (ShapeUtil::ElementsIn(reduce->shape()) == - ShapeUtil::ElementsIn(arg->shape()) || - ShapeUtil::HasZeroElements(arg->shape())) { - auto reshape = computation_->AddInstruction( - HloInstruction::CreateReshape(reduce->shape(), arg)); - return ReplaceWithNewInstruction( - reduce, HloInstruction::CreateMap(reduce->shape(), - {init_value, reshape}, function)); - } return Status::OK(); } @@ -1715,7 +1846,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow( return ReplaceWithNewInstruction( reduce_window, HloInstruction::CreateMap(reduce_window->shape(), - {operand, reduce_window->mutable_operand(1)}, + {reduce_window->mutable_operand(1), operand}, function)); } @@ -2045,6 +2176,39 @@ bool AlgebraicSimplifierVisitor::TransformToClampIfSameShape( return true; } +Status AlgebraicSimplifierVisitor::HandleMap(HloInstruction* map) { + auto* map_computation = map->to_apply(); + auto* map_root = map_computation->root_instruction(); + if (map_root->opcode() == HloOpcode::kParameter) { + ReplaceInstructionIfSameShape( + map, map->mutable_operand(map_root->parameter_number())); + return Status::OK(); + } + if (map_root->opcode() == HloOpcode::kConstant) { + if (!ShapeUtil::IsScalar(map_root->shape())) { + return Status::OK(); + } + auto clone = map_root->CloneWithNewOperands(map_root->shape(), {}); + if (ShapeUtil::IsScalar(map->shape())) { + return ReplaceWithNewInstruction(map, std::move(clone)); + } + return ReplaceWithNewInstruction( + map, + HloInstruction::CreateBroadcast( + map->shape(), computation_->AddInstruction(std::move(clone)), {})); + } + std::vector new_operands; + for (auto* root_operand : map_root->operands()) { + if (root_operand->opcode() != HloOpcode::kParameter) { + return Status::OK(); + } + new_operands.push_back( + map->mutable_operand(root_operand->parameter_number())); + } + auto clone = map_root->CloneWithNewOperands(map->shape(), new_operands); + return ReplaceWithNewInstruction(map, std::move(clone)); +} + Status AlgebraicSimplifierVisitor::HandleMaximum(HloInstruction* maximum) { // Match the following tree: // min_operand 
operand diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc index 20c549562d5153..cda157f9fac163 100644 --- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc +++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_pass_fix.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/window_util.h" @@ -142,6 +143,39 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) { EXPECT_EQ(root, param0); } +TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) { + HloComputation::Builder builder(TestName()); + // Create add computation. + HloComputation* add_computation = nullptr; + { + HloComputation::Builder builder(TestName() + ".add"); + const Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); + HloInstruction* p0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, scalar_shape, "p0")); + HloInstruction* p1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, scalar_shape, "p1")); + builder.AddInstruction( + HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); + add_computation = module().AddEmbeddedComputation(builder.Build()); + } + Shape r2f32 = ShapeUtil::MakeShape(F32, {32, 1}); + HloInstruction* param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, r2f32, "param0")); + HloInstruction* zero = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); + builder.AddInstruction( + HloInstruction::CreateMap(r2f32, {param0, zero}, add_computation)); + + auto computation = module().AddEntryComputation(builder.Build()); + HloInstruction* root = computation->root_instruction(); + EXPECT_EQ(root->opcode(), HloOpcode::kMap); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie()); + root = computation->root_instruction(); + EXPECT_THAT(root, op::Add(param0, zero)); +} + TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) { Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 2}); HloComputation::Builder builder(TestName()); @@ -1317,32 +1351,6 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) { op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape)); } -TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) { - HloComputation::Builder builder(TestName()); - HloInstruction* param = - builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), "param")); - HloInstruction* movable_reshape = - builder.AddInstruction(HloInstruction::CreateReshape( - ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}), param)); - HloInstruction* zero = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(0.0f))); - builder.AddInstruction( - HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}), - HloOpcode::kMaximum, movable_reshape, zero)); - auto computation = module().AddEntryComputation(builder.Build()); - - EXPECT_THAT(computation->root_instruction(), - op::Maximum(op::Reshape(param), zero)); - - AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, - bitcasting_callback()); - - 
simplifier.Run(&module()).ValueOrDie(); - EXPECT_THAT(computation->root_instruction(), - op::Reshape(op::Maximum(param, zero))); -} - // Regression test for a bug in the reshape sinking transformation, where // moving a reshape to a scalar led to a crash. TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) { @@ -1699,14 +1707,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) { builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(F32, {2, 2}), param, zero, no_padding)); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -1732,8 +1740,8 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad( ShapeUtil::MakeShape(F32, {11, 5}), param, zero, padding)); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); @@ -1751,7 +1759,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) { EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero)); EXPECT_TRUE(has_negative_padding(pad)); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero))); EXPECT_FALSE( @@ -1766,14 +1774,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) { builder.AddInstruction( HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {2, 3}), param)); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Reshape(param)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -1789,14 +1797,14 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) { ShapeUtil::MakeShape(F32, {dim0, dim1}), param, /*start_indices=*/{0, 0}, /*limit_indices=*/{dim0, dim1}, /*strides=*/{1, 1})); - HloModule module(TestName()); - HloComputation* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + HloComputation* computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Slice(param)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), param); } @@ -1924,12 +1932,12 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) { 
b.AddInstruction(HloInstruction::CreateConvolve(out_shape, input, filter, window, dnums)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(b.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(b.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true, bitcasting_callback()); - if (!simplifier.Run(&module).ValueOrDie()) { + if (!simplifier.Run(module.get()).ValueOrDie()) { return "NO_CHANGE"; } auto* root = computation->root_instruction(); @@ -2044,15 +2052,15 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kMaximum, min, max_value)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Maximum(op::Minimum(param0, min_value), max_value)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Clamp(max_value, param0, min_value)); @@ -2074,15 +2082,15 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Minimum(op::Maximum(param0, max_value), min_value)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Clamp(max_value, param0, min_value)); @@ -2105,15 +2113,15 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) { builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kMinimum, max, min_value)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Minimum(op::Maximum(param0, max_value), min_value)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Clamp(max_value, param0, min_value)); @@ -2135,15 +2143,15 @@ TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) { builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kMinimum, max, min_value)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Minimum(op::Maximum(param0, max_value), min_value)); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); 
EXPECT_THAT(computation->root_instruction(), op::Minimum(op::Maximum(param0, max_value), min_value)); @@ -2167,8 +2175,8 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) { builder.AddInstruction(HloInstruction::CreateBinary( r0f32, HloOpcode::kMinimum, fmax, min_value)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); EXPECT_THAT(computation->root_instruction(), op::Minimum(op::Add(op::Maximum(param0, max_value), max_value), @@ -2176,7 +2184,7 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - EXPECT_FALSE(simplifier.Run(&module).ValueOrDie()); + EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie()); EXPECT_THAT(computation->root_instruction(), op::Minimum(op::Add(op::Maximum(param0, max_value), max_value), @@ -2201,8 +2209,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice( slice_shape, broadcast, {0, 1, 2, 3}, {2, 3, 5, 6}, {1, 1, 1, 1})); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, slice); @@ -2211,10 +2219,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. - ASSERT_FALSE(simplifier.Run(&module).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(scalar_param)); @@ -2242,8 +2250,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { HloInstruction* reshape = builder.AddInstruction( HloInstruction::CreateReshape(reshape_shape, transpose)); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, reshape); @@ -2251,7 +2259,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); root = computation->root_instruction(); EXPECT_THAT(root, op::Broadcast(forty_two)); @@ -2260,7 +2268,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) { // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x). TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation::Builder builder(TestName()); // Create operand to the pad. 
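// Illustrative note, not part of the original patch: the FoldPadIntoReduceWindow
// test above exercises the rewrite ReduceWindow(Pad(op, v), v) -> ReduceWindow(op, v)
// with the explicit pad absorbed into the reduce-window's padding config. The
// rewrite is sound when the pad value equals the reduce-window init value, since
// every window position then reads either a real element of `op` or the shared
// identity value, in both forms. A minimal standalone C++ sketch of that
// equivalence for a 1-D sum (names below are hypothetical, not XLA APIs):
#include <cassert>
#include <vector>

// 1-D "reduce-window" sum with window size `w`, stride 1, `pad` implicit
// padding positions on each side, and `init` used for out-of-bounds reads.
static std::vector<float> ReduceWindowSum(const std::vector<float>& x, int w,
                                          int pad, float init) {
  const int n = static_cast<int>(x.size()) + 2 * pad;
  std::vector<float> out;
  for (int start = 0; start + w <= n; ++start) {
    float acc = init;
    for (int k = 0; k < w; ++k) {
      const int i = start + k - pad;  // index into the unpadded operand
      acc += (i >= 0 && i < static_cast<int>(x.size())) ? x[i] : init;
    }
    out.push_back(acc);
  }
  return out;
}

int main() {
  const std::vector<float> op = {1, 2, 3, 4};
  const float init = 0.0f;  // pad value == init value: the folding precondition
  // Explicit pad by one element on each side, then reduce-window with no padding...
  const std::vector<float> padded = {init, 1, 2, 3, 4, init};
  const std::vector<float> a = ReduceWindowSum(padded, /*w=*/3, /*pad=*/0, init);
  // ...equals a reduce-window over the unpadded operand with window padding 1.
  const std::vector<float> b = ReduceWindowSum(op, /*w=*/3, /*pad=*/1, init);
  assert(a == b);
  return 0;
}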
@@ -2289,7 +2297,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { HloInstruction::CreateParameter(1, scalar_shape, "p1")); builder.AddInstruction( HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); - add_computation = module.AddEmbeddedComputation(builder.Build()); + add_computation = module->AddEmbeddedComputation(builder.Build()); } // Create the reduce-window. @@ -2312,15 +2320,15 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { add_computation)); // Build the computation and run the simplifier. - auto computation = module.AddEntryComputation(builder.Build()); + auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, reduce_window); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. - ASSERT_FALSE(simplifier.Run(&module).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); // Verify the result root = computation->root_instruction(); @@ -2341,7 +2349,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) { // Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to // ReduceWindow(Convert(op), x). TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation::Builder builder(TestName()); // Create operand to the pad. @@ -2374,7 +2382,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { HloInstruction::CreateParameter(1, scalar_shape, "p1")); builder.AddInstruction( HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1)); - add_computation = module.AddEmbeddedComputation(builder.Build()); + add_computation = module->AddEmbeddedComputation(builder.Build()); } // Create the reduce-window. @@ -2397,15 +2405,15 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) { add_computation)); // Build the computation and run the simplifier. - auto computation = module.AddEntryComputation(builder.Build()); + auto computation = module->AddEntryComputation(builder.Build()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(root, reduce_window); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); // Running simplification again should not result in any further changes. 
- ASSERT_FALSE(simplifier.Run(&module).ValueOrDie()); + ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie()); // Verify the result root = computation->root_instruction(); @@ -2431,12 +2439,12 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) { builder.AddInstruction( HloInstruction::CreateReverse(shape, a, /*dimensions=*/{2, 3})); - HloModule module(TestName()); - auto computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, non_bitcasting_callback()); - ASSERT_TRUE(simplifier.Run(&module).ValueOrDie()); + ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie()); HloInstruction* root = computation->root_instruction(); EXPECT_EQ(a, root); @@ -2962,5 +2970,208 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) { INSTANTIATE_TEST_CASE_P(DotOfConcatSimplificationTestInstantiation, DotOfConcatSimplificationTest, ::testing::ValuesIn(kDotOfConcatTestSpecs)); + +struct DotOfGatherTestSpec { + int64 m; + int64 k; + int64 n; + int s; // start index for dynamic slice on the non-contracting dimension + int64 lcd; // left contracting dimension + int64 rcd; // right contracting dimension + bool neg; // is negative testcase +}; + +class DotOfGatherSimplificationTest + : public HloVerifiedTestBase, + public ::testing::WithParamInterface {}; + +// input: dot(DS(ctA), ctB)) +// where DS(ctA) = DS({M x K}, {s, 0}, {1, K}) and ctB = {K x N}. +// => input dimensions: dot({1 x K}, {K x N}) => {1 x N}. +// output: DS(dot(ctA, ctB)) +// => output dimensions: DS ({M x N}, {s, 0}, {1, N}) => {1 x N}. +TEST_P(DotOfGatherSimplificationTest, ConstantRHS) { + HloComputation::Builder builder(TestName()); + + DotOfGatherTestSpec spec = GetParam(); + + ASSERT_LE(spec.s, spec.m); + + // For negative tests, increase k of the dynamic slice argument to prevent the + // optimization (constants ctA, ctB must have equal contracting dimensions). + int64 k_increase = spec.neg ? 5 : 0; + int64 lhs_rows = (spec.lcd == 0) ? (spec.k + k_increase) : spec.m; + int64 lhs_cols = (spec.lcd == 0) ? spec.m : (spec.k + k_increase); + Shape lhs_shape = ShapeUtil::MakeShape(F32, {lhs_rows, lhs_cols}); + auto* lhs = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + /*from=*/10.0, /*to=*/10000.0, /*rows=*/lhs_rows, + /*cols=*/lhs_cols))); + + int32 start_row = (spec.lcd == 0) ? 0 : spec.s; + int32 start_col = (spec.lcd == 0) ? spec.s : 0; + const auto start_indices = + builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({start_row, start_col}))); + int64 slice_row_size = (spec.lcd == 0) ? spec.k : 1; + int64 slice_col_size = (spec.lcd == 0) ? 1 : spec.k; + Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size}); + auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice( + ds_shape, lhs, start_indices, {slice_row_size, slice_col_size})); + + int64 rhs_rows = (spec.rcd == 0) ? spec.k : spec.n; + int64 rhs_cols = (spec.rcd == 0) ? 
spec.n : spec.k; + Shape rhs_shape = ShapeUtil::MakeShape(F32, {rhs_rows, rhs_cols}); + auto* rhs = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + /*from=*/10.0, /*to=*/10000.0, /*rows=*/rhs_rows, + /*cols=*/rhs_cols))); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(spec.lcd); + dot_dnums.add_rhs_contracting_dimensions(spec.rcd); + + int64 dot_row_size = 1; + int64 dot_col_size = spec.n; + Shape dot_shape = ShapeUtil::MakeShape(F32, {dot_row_size, dot_col_size}); + builder.AddInstruction( + HloInstruction::CreateDot(dot_shape, ds, rhs, dot_dnums)); + + auto computation = module().AddEntryComputation(builder.Build()); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module())); + ASSERT_TRUE(run_successful); + EXPECT_TRUE( + ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape)); + + if (spec.neg) { + EXPECT_NE(computation->root_instruction()->opcode(), + HloOpcode::kDynamicSlice); + } else { + EXPECT_THAT(computation->root_instruction(), + op::DynamicSlice(op::Dot(op::Constant(), op::Constant()), + op::Concatenate())); + } +} + +// input: dot(ctA, DS(ctB)) +// where ctA = {M x K} and DS(ctB) = DS({K x N}, {0, s}, {K, 1}). +// => input dimensions: dot({M x K}, {K x 1}) => {M x 1}. +// output: DS(dot(ctA, ctB)) +// => output dimensions: DS ({M x N}, {0, s}, {M, 1}) => {M x 1}. +TEST_P(DotOfGatherSimplificationTest, ConstantLHS) { + HloComputation::Builder builder(TestName()); + + DotOfGatherTestSpec spec = GetParam(); + + ASSERT_LE(spec.s, spec.n); + + int64 lhs_rows = (spec.lcd == 0) ? spec.k : spec.m; + int64 lhs_cols = (spec.lcd == 0) ? spec.m : spec.k; + Shape lhs_shape = ShapeUtil::MakeShape(F32, {lhs_rows, lhs_cols}); + auto* lhs = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + /*from=*/10.0, /*to=*/10000.0, /*rows=*/lhs_rows, + /*cols=*/lhs_cols))); + + // For negative tests increase k of the dynamic slice argument to prevent the + // optimization + int64 k_increase = spec.neg ? 5 : 0; + int64 rhs_rows = (spec.rcd == 0) ? (spec.k + k_increase) : spec.n; + int64 rhs_cols = (spec.rcd == 0) ? spec.n : (spec.k + k_increase); + Shape rhs_shape = ShapeUtil::MakeShape(F32, {rhs_rows, rhs_cols}); + auto* rhs = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2F32Linspace( + /*from=*/10.0, /*to=*/10000.0, /*rows=*/rhs_rows, + /*cols=*/rhs_cols))); + + int32 start_row = (spec.rcd == 0) ? 0 : spec.s; + int32 start_col = (spec.rcd == 0) ? spec.s : 0; + const auto start_indices = + builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({start_row, start_col}))); + int64 slice_row_size = (spec.rcd == 0) ? spec.k : 1; + int64 slice_col_size = (spec.rcd == 0) ? 
1 : spec.k; + Shape ds_shape = ShapeUtil::MakeShape(F32, {slice_row_size, slice_col_size}); + auto* ds = builder.AddInstruction(HloInstruction::CreateDynamicSlice( + ds_shape, rhs, start_indices, {slice_row_size, slice_col_size})); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(spec.lcd); + dot_dnums.add_rhs_contracting_dimensions(spec.rcd); + + int64 dot_row_size = spec.m; + int64 dot_col_size = 1; + Shape dot_shape = ShapeUtil::MakeShape(F32, {dot_row_size, dot_col_size}); + builder.AddInstruction( + HloInstruction::CreateDot(dot_shape, lhs, ds, dot_dnums)); + + auto computation = module().AddEntryComputation(builder.Build()); + AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false, + non_bitcasting_callback()); + TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(&module())); + ASSERT_TRUE(run_successful); + EXPECT_TRUE( + ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape)); + + if (spec.neg) { + EXPECT_NE(computation->root_instruction()->opcode(), + HloOpcode::kDynamicSlice); + } else { + EXPECT_THAT(computation->root_instruction(), + op::DynamicSlice(op::Dot(op::Constant(), op::Constant()), + op::Concatenate())); + } +} + +std::vector DotOfGatherPositiveNegativeTests() { + std::vector positives = { + // "Classical dot", i.e. matrix multiply: + {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/1, /*rcd=*/0, + /*neg=*/false}, + {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/1, /*rcd=*/0, + /*neg=*/false}, + {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/1, /*rcd=*/0, + /*neg=*/false}, + // Note: testing for m=1 and n=1 is unnecessary, as this optimizes to + // dot(ct, ct) before DotOfGather optimization kicks in. + // Contract on rows: + {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/0, /*rcd=*/0, + /*neg=*/false}, + {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/0, /*rcd=*/0, + /*neg=*/false}, + {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/0, /*rcd=*/0, + /*neg=*/false}, + // Reverse matrix multiply: + {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/0, /*rcd=*/1, + /*neg=*/false}, + {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/0, /*rcd=*/1, + /*neg=*/false}, + {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/0, /*rcd=*/1, + /*neg=*/false}, + // Contract on columns: + {/*m=*/10, /*k=*/10, /*n=*/5, /*s=*/0, /*lcd=*/1, /*rcd=*/1, + /*neg=*/false}, + {/*m=*/20, /*k=*/20, /*n=*/3, /*s=*/2, /*lcd=*/1, /*rcd=*/1, + /*neg=*/false}, + {/*m=*/10, /*k=*/3, /*n=*/10, /*s=*/9, /*lcd=*/1, /*rcd=*/1, + /*neg=*/false}, + }; + std::vector all; + for (int i = 0; i < positives.size(); i++) { + DotOfGatherTestSpec positive_test = positives[i]; + all.push_back(positive_test); + DotOfGatherTestSpec negative_test = positive_test; + negative_test.neg = true; + all.push_back(negative_test); + } + return all; +} + +INSTANTIATE_TEST_CASE_P( + DotOfGatherSimplificationTestInstantiation, DotOfGatherSimplificationTest, + ::testing::ValuesIn(DotOfGatherPositiveNegativeTests())); + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/allocation_tracker.cc b/tensorflow/compiler/xla/service/allocation_tracker.cc index cf1231bcce4d00..95b4cb6d2e6940 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.cc +++ b/tensorflow/compiler/xla/service/allocation_tracker.cc @@ -101,7 +101,7 @@ StatusOr AllocationTracker::RegisterInternal( return result; } -tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) { +Status AllocationTracker::Unregister(const GlobalDataHandle& data) { tensorflow::mutex_lock 
lock(mutex_); VLOG(2) << "Unregister(" << "handle: " << data.handle() << ")"; @@ -130,7 +130,7 @@ tensorflow::Status AllocationTracker::Unregister(const GlobalDataHandle& data) { for (auto& shaped_buffer : it->second) { shaped_buffer.reset(); } - return tensorflow::Status::OK(); + return Status::OK(); } StatusOr> AllocationTracker::DeconstructTuple( @@ -220,8 +220,10 @@ void AllocationTracker::AddAllocationOrIncrementRefCount( AllocationMap& allocation_map = opaque_to_allocation_map_[device_ordinal]; auto it = allocation_map.find(device_memory.opaque()); if (it == allocation_map.end()) { - allocation_map[device_memory.opaque()] = {device_memory, device_ordinal, - /*ref_count=*/1}; + allocation_map[device_memory.opaque()] = { + OwningDeviceMemory(device_memory, device_ordinal, + backend_->memory_allocator()), + /*ref_count=*/1}; } else { it->second.ref_count++; } @@ -235,13 +237,12 @@ Status AllocationTracker::DecrementRefCount(se::DeviceMemoryBase device_memory, Allocation& allocation = it->second; TF_RET_CHECK(allocation.ref_count >= 1); if (allocation.ref_count == 1) { - TF_RETURN_IF_ERROR(backend_->memory_allocator()->Deallocate( - device_ordinal, &device_memory)); + allocation.device_memory.Free(); allocation_map.erase(it); } else { allocation.ref_count--; } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/allocation_tracker.h b/tensorflow/compiler/xla/service/allocation_tracker.h index 1174fa641c06ae..a7d8927cf7e90d 100644 --- a/tensorflow/compiler/xla/service/allocation_tracker.h +++ b/tensorflow/compiler/xla/service/allocation_tracker.h @@ -76,10 +76,7 @@ class AllocationTracker { // Data structure encapsulating single memory allocation on the device. struct Allocation { // The pointer to this allocation. - se::DeviceMemoryBase device_memory; - - // The device that the memory is allocated on. - int device_ordinal; + OwningDeviceMemory device_memory; // This is the number of times this memory allocation is referred to by // registered data handles. @@ -126,7 +123,10 @@ class AllocationTracker { int64 next_handle_ GUARDED_BY(mutex_); // A map from device ordinal to AllocationMap. - tensorflow::gtl::FlatMap opaque_to_allocation_map_ + // + // This is not a TF FlatMap because (currently) FlatMap (and therefore + // AllocationMap) is not movable. + std::unordered_map opaque_to_allocation_map_ GUARDED_BY(mutex_); // A map from data handle to a vector of shaped buffers that represent the diff --git a/tensorflow/compiler/xla/service/backend.cc b/tensorflow/compiler/xla/service/backend.cc index b1d616ec3506f9..349b32451a697d 100644 --- a/tensorflow/compiler/xla/service/backend.cc +++ b/tensorflow/compiler/xla/service/backend.cc @@ -138,9 +138,6 @@ Backend::Backend( << "Service found no devices for backend " << platform_->Name() << '.'; if (platform->id() == se::host::kHostPlatformId) { - inter_op_thread_pool_.reset(new tensorflow::thread::ThreadPool( - tensorflow::Env::Default(), "xla_inter_op", - tensorflow::port::NumSchedulableCPUs())); const int num_threads = intra_op_parallelism_threads > 0 ? 
intra_op_parallelism_threads : tensorflow::port::NumSchedulableCPUs(); @@ -155,10 +152,6 @@ int Backend::default_device_ordinal() const { return default_stream_executor()->device_ordinal(); } -tensorflow::thread::ThreadPool* Backend::inter_op_thread_pool() const { - return inter_op_thread_pool_.get(); -} - const Eigen::ThreadPoolDevice* Backend::eigen_intra_op_thread_pool_device() const { if (intra_op_thread_pool_wrapper_ == nullptr) { diff --git a/tensorflow/compiler/xla/service/backend.h b/tensorflow/compiler/xla/service/backend.h index d32a0a400d8bd5..6546602473e338 100644 --- a/tensorflow/compiler/xla/service/backend.h +++ b/tensorflow/compiler/xla/service/backend.h @@ -140,10 +140,6 @@ class Backend { // be equivalent to an executable compiled for the other. StatusOr devices_equivalent(int device_ordinal_a, int device_ordinal_b); - // For the host platform, returns the threadpool to use when scheduling - // parallel operators. For other platforms, returns NULL. - tensorflow::thread::ThreadPool* inter_op_thread_pool() const; - // For the host platform, returns the configured eigen threadpool device to be // used for scheduling work. For other platforms, returns NULL. const Eigen::ThreadPoolDevice* eigen_intra_op_thread_pool_device() const; @@ -178,9 +174,6 @@ class Backend { // The default memory allocator to use. std::unique_ptr memory_allocator_; - // For the CPU backend, a threadpool for scheduling parallel operators. - std::unique_ptr inter_op_thread_pool_; - // For the CPU backend, an Eigen threadpool device for use by Eigen code. std::unique_ptr intra_op_thread_pool_wrapper_; }; diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.cc b/tensorflow/compiler/xla/service/batch_dot_simplification.cc new file mode 100644 index 00000000000000..2099916509acdb --- /dev/null +++ b/tensorflow/compiler/xla/service/batch_dot_simplification.cc @@ -0,0 +1,99 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/batch_dot_simplification.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_creation_utils.h" + +namespace xla { +StatusOr +BatchDotSimplification::ElideDegenerateBatchDimensionFromBatchDot( + HloInstruction* batch_dot) { + const DotDimensionNumbers& dim_numbers = batch_dot->dot_dimension_numbers(); + HloInstruction *lhs = batch_dot->mutable_operand(0), + *rhs = batch_dot->mutable_operand(1); + const Shape& lhs_shape = lhs->shape(); + + std::vector degenerate_dims; + for (int64 batch_dim : dim_numbers.lhs_batch_dimensions()) { + if (lhs_shape.dimensions(batch_dim) == 1) { + degenerate_dims.push_back(batch_dim); + } + } + + if (degenerate_dims.empty()) { + return false; + } + + TF_ASSIGN_OR_RETURN(HloInstruction * new_lhs, + ElideDegenerateDims(lhs, degenerate_dims)); + TF_ASSIGN_OR_RETURN(HloInstruction * new_rhs, + ElideDegenerateDims(rhs, degenerate_dims)); + + DotDimensionNumbers new_dim_numbers = dim_numbers; + new_dim_numbers.clear_lhs_batch_dimensions(); + new_dim_numbers.clear_rhs_batch_dimensions(); + + for (int64 i = 0, e = dim_numbers.lhs_batch_dimensions_size() - + degenerate_dims.size(); + i < e; i++) { + new_dim_numbers.add_lhs_batch_dimensions(i); + new_dim_numbers.add_rhs_batch_dimensions(i); + } + + new_dim_numbers.set_lhs_contracting_dimensions( + 0, + new_dim_numbers.lhs_contracting_dimensions(0) - degenerate_dims.size()); + new_dim_numbers.set_rhs_contracting_dimensions( + 0, + new_dim_numbers.rhs_contracting_dimensions(0) - degenerate_dims.size()); + + TF_ASSIGN_OR_RETURN(HloInstruction * new_dot, + MakeDotHlo(new_lhs, new_rhs, new_dim_numbers)); + + TF_ASSIGN_OR_RETURN(HloInstruction * new_dot_reshaped, + MakeReshapeHlo(batch_dot->shape(), new_dot)); + + VLOG(2) << "Replaced " << batch_dot->ToString() << " with " + << new_dot->ToString(); + + TF_RETURN_IF_ERROR( + batch_dot->parent()->ReplaceInstruction(batch_dot, new_dot_reshaped)); + + return true; +} + +tensorflow::StringPiece BatchDotSimplification::name() const { + return "batch-dot-simplification"; +} + +StatusOr BatchDotSimplification::Run(HloModule* module) { + bool changed = false; + std::vector dot_instrs; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + c_copy_if(computation->instructions(), std::back_inserter(dot_instrs), + [](HloInstruction* instr) { + return instr->opcode() == HloOpcode::kDot; + }); + } + for (HloInstruction* dot_instr : dot_instrs) { + TF_ASSIGN_OR_RETURN(bool elided_batch_dim_from_one, + ElideDegenerateBatchDimensionFromBatchDot(dot_instr)); + changed |= elided_batch_dim_from_one; + } + return changed; +} +} // namespace xla diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification.h b/tensorflow/compiler/xla/service/batch_dot_simplification.h new file mode 100644 index 00000000000000..c0ca8d8ebac1a3 --- /dev/null +++ b/tensorflow/compiler/xla/service/batch_dot_simplification.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BATCH_DOT_SIMPLIFICATION_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_BATCH_DOT_SIMPLIFICATION_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" + +namespace xla { +// Simplifies batch dot operations. +// +// Normally these would live in the algebraic simplifier, but we want to run +// this to fixpoint (this pass reaches fixed point in one execution) before we +// run the DotDecomposer. +class BatchDotSimplification : public HloPassInterface { + public: + StatusOr Run(HloModule* module) override; + tensorflow::StringPiece name() const override; + + private: + StatusOr ElideDegenerateBatchDimensionFromBatchDot( + HloInstruction* batch_dot); +}; +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_BATCH_DOT_SIMPLIFICATION_H_ diff --git a/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc new file mode 100644 index 00000000000000..38f1a5d3a645f9 --- /dev/null +++ b/tensorflow/compiler/xla/service/batch_dot_simplification_test.cc @@ -0,0 +1,168 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/batch_dot_simplification.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; + +class BatchDotSimplificationTest : public HloVerifiedTestBase {}; + +TEST_F(BatchDotSimplificationTest, + ElideSingleDegenerateBatchDotDim_VectorVector) { + const string hlo_text = R"( +HloModule BatchDot + +main { + a = f32[1,3] parameter(0) + b = f32[1,3] parameter(1) + ROOT dot = f32[1] dot(a, b), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={1} +} +)"; + + ParseAndVerifyModule(hlo_text); + BatchDotSimplification pass; + ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + + HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Reshape(op::Dot( + op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), + /*lhs_contracting_dim=*/0, /*rhs_contracting_dim=*/0))); +} + +TEST_F(BatchDotSimplificationTest, + ElideSingleDegenerateBatchDotDim_MatrixVector) { + const string hlo_text = R"( +HloModule BatchDot + +main { + a = f32[1,9,3] parameter(0) + b = f32[1,3] parameter(1) + ROOT dot = f32[1,9] dot(a, b), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_contracting_dims={1} +} +)"; + + ParseAndVerifyModule(hlo_text); + BatchDotSimplification pass; + ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + + HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Reshape(op::Dot( + op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0))); +} + +TEST_F(BatchDotSimplificationTest, + ElideSingleDegenerateBatchDotDim_MatrixMatrix) { + const string hlo_text = R"( +HloModule BatchDot + +main { + a = f32[1,9,3] parameter(0) + b = f32[1,3,7] parameter(1) + ROOT dot = f32[1,9,7] dot(a, b), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={2}, rhs_contracting_dims={1} +} +)"; + + ParseAndVerifyModule(hlo_text); + BatchDotSimplification pass; + ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + + HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Reshape(op::Dot( + op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0))); +} + +TEST_F(BatchDotSimplificationTest, + ElideMultipleDegenerateBatchDotDims_VectorVector) { + const string hlo_text = R"( +HloModule BatchDot + +main { + a = f32[9,1,7,1,3] parameter(0) + b = f32[9,1,7,1,3] parameter(1) + ROOT dot = f32[9,1,7,1] dot(a, b), lhs_batch_dims={0,1,2,3}, rhs_batch_dims={0,1,2,3}, lhs_contracting_dims={4}, rhs_contracting_dims={4} +} +)"; + + ParseAndVerifyModule(hlo_text); + BatchDotSimplification pass; + ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + + HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Reshape(op::Dot( + op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), + /*lhs_contracting_dim=*/2, /*rhs_contracting_dim=*/2))); +} + +TEST_F(BatchDotSimplificationTest, + ElideMultipleDegenerateBatchDotDims_VectorMatrix) { + const string hlo_text = R"( +HloModule BatchDot + +main { + a = f32[9,1,7,1,3] parameter(0) + b 
= f32[9,1,7,1,20,3] parameter(1) + ROOT dot = f32[9,1,7,1,20] dot(a, b), lhs_batch_dims={0,1,2,3}, rhs_batch_dims={0,1,2,3}, lhs_contracting_dims={4}, rhs_contracting_dims={5} +} +)"; + + ParseAndVerifyModule(hlo_text); + BatchDotSimplification pass; + ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + + HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Reshape(op::Dot( + op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), + /*lhs_contracting_dim=*/2, /*rhs_contracting_dim=*/3))); +} + +TEST_F(BatchDotSimplificationTest, + ElideMultipleDegenerateBatchDotDims_MatrixMatrix) { + const string hlo_text = R"( +HloModule BatchDot + +main { + a = f32[9,1,7,1,19,3] parameter(0) + b = f32[9,1,7,1,3,20] parameter(1) + ROOT dot = f32[9,1,7,1,19,20] dot(a, b), lhs_batch_dims={0,1,2,3}, rhs_batch_dims={0,1,2,3}, lhs_contracting_dims={5}, rhs_contracting_dims={4} +} +)"; + + ParseAndVerifyModule(hlo_text); + BatchDotSimplification pass; + ASSERT_TRUE(pass.Run(&module()).ValueOrDie()); + + HloInstruction* root = module().entry_computation()->root_instruction(); + EXPECT_THAT(root, + op::Reshape(op::Dot( + op::Reshape(op::Parameter(0)), op::Reshape(op::Parameter(1)), + /*lhs_contracting_dim=*/3, /*rhs_contracting_dim=*/2))); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc index 38086bd7e12184..598718c72c6941 100644 --- a/tensorflow/compiler/xla/service/batchnorm_expander.cc +++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc @@ -15,35 +15,32 @@ limitations under the License. #include "tensorflow/compiler/xla/service/batchnorm_expander.h" -#include #include -#include -#include #include #include #include -#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/hlo_query.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/flatmap.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" namespace xla { +namespace { + // BatchNormExpanderVisitor traverses the HLO computation and rewrites BatchNorm // operations into smaller operations. 
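// Illustrative note, not part of the original patch: in training mode the
// expansion performed by this visitor computes, per feature with N elements,
//     E[X]   = sum(X) / N
//     Var[X] = E[X^2] - E[X]^2
//     Y      = (X - E[X]) * rsqrt(Var[X] + epsilon) * scale + offset
// where rsqrt(v) is realized as pow(v, -0.5) through a scalar Map computation.
// A minimal standalone C++ sketch of the same arithmetic (hypothetical names,
// not XLA APIs):
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};  // one feature, N = 4
  const float scale = 2.f, offset = 0.5f, epsilon = 1e-3f;
  const float n = static_cast<float>(x.size());
  float sum = 0.f, sum_sq = 0.f;
  for (float v : x) { sum += v; sum_sq += v * v; }
  const float mean = sum / n;                    // E[X]
  const float var = sum_sq / n - mean * mean;    // E[X^2] - E^2[X]
  const float rsqrt = std::pow(var + epsilon, -0.5f);  // 1/sqrt(Var[X] + eps)
  for (float v : x) {
    std::printf("%f\n", (v - mean) * rsqrt * scale + offset);
  }
  return 0;
}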
class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { @@ -80,30 +77,88 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { rewrite_grad_op_(rewrite_grad_op), use_fusion_(use_fusion) {} - HloComputation* GetScalarBinaryComputation(PrimitiveType primitive_type, - HloOpcode opcode) { - HloComputation::Builder b("scalar_computation"); - auto scalar_lhs = b.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(primitive_type, {}), "scalar_lhs")); - auto scalar_rhs = b.AddInstruction(HloInstruction::CreateParameter( - 1, ShapeUtil::MakeShape(primitive_type, {}), "scalar_rhs")); - auto scalar_op = b.AddInstruction( - HloInstruction::CreateBinary(ShapeUtil::MakeShape(primitive_type, {}), - opcode, scalar_lhs, scalar_rhs)); - return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); + HloComputation* GetOrCreateScalarAddComputation( + PrimitiveType primitive_type) { + HloComputation** scalar_add_computation = + &scalar_add_computations_[primitive_type]; + if (*scalar_add_computation) { + return *scalar_add_computation; + } + + HloComputation::Builder b("scalar_add_computation"); + Shape shape = ShapeUtil::MakeShape(primitive_type, {}); + auto scalar_lhs = b.AddInstruction( + HloInstruction::CreateParameter(0, shape, "scalar_lhs")); + auto scalar_rhs = b.AddInstruction( + HloInstruction::CreateParameter(1, shape, "scalar_rhs")); + auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs)); + *scalar_add_computation = + computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); + return *scalar_add_computation; } - // Current HloComputation instance the BatchNormExpander is - // traversing. - HloComputation* computation_; + // TODO(b/80534766): Remove maps after performance issues with scalar + // broadcasts are resolved on all backends. + HloComputation* GetOrCreateScalarRsqrtComputation( + PrimitiveType primitive_type) { + HloComputation** scalar_rsqrt_computation = + &scalar_rsqrt_computations_[primitive_type]; + if (*scalar_rsqrt_computation) { + return *scalar_rsqrt_computation; + } - bool rewrite_training_op_; - bool rewrite_inference_op_; - bool rewrite_grad_op_; - bool use_fusion_; + HloComputation::Builder b("scalar_add_computation"); + Shape shape = ShapeUtil::MakeShape(primitive_type, {}); + auto scalar_lhs = b.AddInstruction( + HloInstruction::CreateParameter(0, shape, "scalar_lhs")); + auto scalar_rhs = b.AddInstruction(HloInstruction::CreateConvert( + shape, b.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR0(-0.5f))))); + auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kPower, scalar_lhs, scalar_rhs)); + *scalar_rsqrt_computation = + computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); + return *scalar_rsqrt_computation; + } - // Whether rewrite has occurred. 
- bool changed_ = false; + std::unique_ptr Rsqrt(HloInstruction* operand) { + return HloInstruction::CreateMap( + operand->shape(), {operand}, + GetOrCreateScalarRsqrtComputation(operand->shape().element_type())); + } + + HloComputation* GetOrCreateScalarMeanComputation(PrimitiveType primitive_type, + int64 element_count) { + HloComputation** scalar_mean_computation = + &scalar_mean_computations_[std::pair( + primitive_type, element_count)]; + if (*scalar_mean_computation) { + return *scalar_mean_computation; + } + + HloComputation::Builder b("scalar_add_computation"); + Shape shape = ShapeUtil::MakeShape(primitive_type, {}); + auto scalar_lhs = b.AddInstruction( + HloInstruction::CreateParameter(0, shape, "scalar_lhs")); + auto scalar_rhs = b.AddInstruction(HloInstruction::CreateConvert( + shape, b.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0( + 1.0f / static_cast(element_count)))))); + auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary( + shape, HloOpcode::kMultiply, scalar_lhs, scalar_rhs)); + *scalar_mean_computation = + computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op)); + return *scalar_mean_computation; + } + + std::unique_ptr Mean(int64 element_count, + HloInstruction* operand) { + return HloInstruction::CreateMap( + operand->shape(), {operand}, + GetOrCreateScalarMeanComputation(operand->shape().element_type(), + element_count)); + } // Replaces the existing HLO instruction old_instruction, with // new_instruction, and marks the optimizer status as changed. @@ -127,8 +182,29 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault { changed_ = true; return Status::OK(); } + // Current HloComputation instance the BatchNormExpander is + // traversing. + HloComputation* computation_; + + bool rewrite_training_op_; + bool rewrite_inference_op_; + bool rewrite_grad_op_; + bool use_fusion_; + + // Whether rewrite has occurred. + bool changed_ = false; + + // Cached computations for adding two scalars. + tensorflow::gtl::FlatMap + scalar_add_computations_; + tensorflow::gtl::FlatMap + scalar_rsqrt_computations_; + tensorflow::gtl::FlatMap, HloComputation*> + scalar_mean_computations_; }; +} // namespace + bool BatchNormExpanderVisitor::Run(HloComputation* computation, bool rewrite_training_op, bool rewrite_inference_op, @@ -156,6 +232,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( added_instructions.push_back(added_inst); return added_inst; }; + auto add_binary = [&](const Shape& shape, const HloOpcode opcode, + HloInstruction* a, HloInstruction* b) { + return add(HloInstruction::CreateBinary(shape, opcode, a, b)); + }; int64 instruction_count_before = computation_->instruction_count(); // Expand batch norm training into smaller HLO ops. 
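// Illustrative sketch, not part of the original patch: the helpers introduced
// above build one scalar computation per element type (or per (type, N) pair
// for Mean), cache it, and then apply it elementwise via Map. A plain-C++
// analogue of that get-or-create-then-map pattern, shown here for rsqrt
// (hypothetical names, not XLA APIs):
#include <cmath>
#include <functional>
#include <unordered_map>
#include <vector>

using ScalarFn = std::function<float(float)>;

// Builds the "scalar rsqrt computation" once per key and reuses it afterwards.
static const ScalarFn& GetOrCreateRsqrt(
    int key, std::unordered_map<int, ScalarFn>* cache) {
  auto it = cache->find(key);
  if (it == cache->end()) {
    it = cache->emplace(key, [](float v) { return std::pow(v, -0.5f); }).first;
  }
  return it->second;
}

// Applies a scalar computation elementwise, like an HLO Map instruction.
static std::vector<float> Map(const std::vector<float>& operand,
                              const ScalarFn& fn) {
  std::vector<float> out;
  out.reserve(operand.size());
  for (float v : operand) out.push_back(fn(v));
  return out;
}

int main() {
  std::unordered_map<int, ScalarFn> cache;
  const std::vector<float> var_plus_eps = {0.25f, 1.0f, 4.0f};
  // Mirrors add(Rsqrt(var_add_epsilon)) in the pass: elementwise 1/sqrt(v).
  const std::vector<float> rsqrt =
      Map(var_plus_eps, GetOrCreateRsqrt(/*key=*/0, &cache));
  (void)rsqrt;
  return 0;
}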
@@ -165,12 +245,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( int64 feature_index = batch_norm->feature_index(); const int64 feature_count = operand_shape.dimensions(feature_index); const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape); - auto elements_per_feature_literal = - Literal::CreateR0(size_in_elements / feature_count); - TF_ASSIGN_OR_RETURN(elements_per_feature_literal, - elements_per_feature_literal->Convert(ptype)); - auto elements_per_feature = add( - HloInstruction::CreateConstant(std::move(elements_per_feature_literal))); + int64 elements_per_feature_int64 = size_in_elements / feature_count; HloInstruction* scale = batch_norm->mutable_operand(1); HloInstruction* offset = batch_norm->mutable_operand(2); @@ -182,8 +257,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon()); TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype)); - auto epsilon = - add(HloInstruction::CreateConstant(std::move(epsilon_literal))); + auto epsilon = add(HloInstruction::CreateBroadcast( + operand_shape, + add(HloInstruction::CreateConstant(std::move(epsilon_literal))), {})); std::vector dimensions_without_feature; for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) { @@ -199,11 +275,11 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( HloInstruction::CreateBroadcast(operand_shape, offset, {feature_index})); HloComputation* add_reduce_computation = - GetScalarBinaryComputation(ptype, HloOpcode::kAdd); + GetOrCreateScalarAddComputation(ptype); // X^2. - auto operand_squared = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kMultiply, operand, operand)); + auto operand_squared = + add_binary(operand_shape, HloOpcode::kMultiply, operand, operand); // Sum[X]. auto sum = add(HloInstruction::CreateReduce(feature_shape, operand, zero, dimensions_without_feature, @@ -229,56 +305,47 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining( } // E[X]. - auto mean = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kDivide, sum, elements_per_feature)); + auto mean = add(Mean(elements_per_feature_int64, sum)); auto mean_broadcasted = add( HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index})); // E[X^2]. - auto square_mean = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kDivide, squared_sum, elements_per_feature)); + auto square_mean = add(Mean(elements_per_feature_int64, squared_sum)); // E^2[X]. - auto mean_square = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kMultiply, mean, mean)); + auto mean_square = + add_binary(feature_shape, HloOpcode::kMultiply, mean, mean); // Var[X]. - auto var = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kSubtract, square_mean, mean_square)); + auto var = + add_binary(feature_shape, HloOpcode::kSubtract, square_mean, mean_square); auto var_broadcasted = add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index})); // Var[X] + epsilon. - auto var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon)); - - auto neg_half_literal = Literal::CreateR0(-0.5f); - TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype)); - auto neg_half = - add(HloInstruction::CreateConstant(std::move(neg_half_literal))); + auto var_add_epsilon = + add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon); // 1 / Sqrt[Var[X] + epsilon]. 
- auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half)); + auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon)); // X - E[X]. - auto operand_minus_mean = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted)); + auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract, + operand, mean_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon]. - auto normalized = add( - HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply, - operand_minus_mean, rsqrt_var_add_epsilon)); + auto normalized = add_binary(operand_shape, HloOpcode::kMultiply, + operand_minus_mean, rsqrt_var_add_epsilon); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale. - auto scaled_normalized = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted)); + auto scaled_normalized = add_binary(operand_shape, HloOpcode::kMultiply, + normalized, scale_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset. - auto shifted_normalized = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted)); + auto shifted_normalized = add_binary(operand_shape, HloOpcode::kAdd, + scaled_normalized, offset_broadcasted); auto tuple = HloInstruction::CreateTuple({shifted_normalized, mean, var}); @@ -320,8 +387,11 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference( auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon()); TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype)); - auto epsilon = computation_->AddInstruction( - HloInstruction::CreateConstant(std::move(epsilon_literal))); + auto epsilon = computation_->AddInstruction(HloInstruction::CreateBroadcast( + operand_shape, + computation_->AddInstruction( + HloInstruction::CreateConstant(std::move(epsilon_literal))), + {})); std::vector dimensions_without_feature; @@ -338,6 +408,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference( added_instructions.push_back(added_inst); return added_inst; }; + auto add_binary = [&](const Shape& shape, const HloOpcode opcode, + HloInstruction* a, HloInstruction* b) { + return add(HloInstruction::CreateBinary(shape, opcode, a, b)); + }; int64 instruction_count_before = computation_->instruction_count(); auto scale_broadcasted = add( @@ -353,30 +427,23 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference( add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index})); // Var[X] + epsilon. - auto var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon)); - - auto neg_half_literal = Literal::CreateR0(-0.5f); - TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype)); - auto neg_half = - add(HloInstruction::CreateConstant(std::move(neg_half_literal))); + auto var_add_epsilon = + add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon); // 1 / Sqrt[Var[X] + epsilon]. - auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half)); + auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon)); // X - E[X]. - auto operand_minus_mean = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted)); + auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract, + operand, mean_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon]. 
- auto normalized = add( - HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply, - operand_minus_mean, rsqrt_var_add_epsilon)); + auto normalized = add_binary(operand_shape, HloOpcode::kMultiply, + operand_minus_mean, rsqrt_var_add_epsilon); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale. - auto scaled_normalized = add(HloInstruction::CreateBinary( - operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted)); + auto scaled_normalized = add_binary(operand_shape, HloOpcode::kMultiply, + normalized, scale_broadcasted); // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset. auto shifted_normalized = HloInstruction::CreateBinary( @@ -424,6 +491,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( added_instructions.push_back(added_inst); return added_inst; }; + auto add_binary = [&](const Shape& shape, const HloOpcode opcode, + HloInstruction* a, HloInstruction* b) { + return add(HloInstruction::CreateBinary(shape, opcode, a, b)); + }; int64 instruction_count_before = computation_->instruction_count(); HloInstruction* activation = batch_norm->mutable_operand(0); @@ -439,26 +510,20 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape); const int64 feature_count = activation_shape.dimensions(feature_index); - auto elements_per_feature_literal = - Literal::CreateR0(size_in_elements / feature_count); - TF_ASSIGN_OR_RETURN(elements_per_feature_literal, - elements_per_feature_literal->Convert(ptype)); - auto elements_per_feature = add( - HloInstruction::CreateConstant(std::move(elements_per_feature_literal))); + const int64 elements_per_feature_int64 = size_in_elements / feature_count; auto zero_literal = Literal::CreateR0(0.0f); TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype)); auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal))); - auto neg_half_literal = Literal::CreateR0(-0.5f); - TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype)); - auto neg_half = - add(HloInstruction::CreateConstant(std::move(neg_half_literal))); - auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon()); TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype)); - auto epsilon = + auto epsilon_scalar = add(HloInstruction::CreateConstant(std::move(epsilon_literal))); + auto epsilon_activation = add( + HloInstruction::CreateBroadcast(activation_shape, epsilon_scalar, {})); + auto epsilon_feature = + add(HloInstruction::CreateBroadcast(feature_shape, epsilon_scalar, {})); std::vector dimensions_without_feature; @@ -478,29 +543,24 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( HloInstruction::CreateBroadcast(activation_shape, mean, {feature_index})); // rsqrt[Var[X] + epsilon]. - auto rsqrt_var_add_epsilon_broadcasted = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kPower, - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd, - variance_broadcasted, epsilon)), - neg_half)); - - auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kPower, - add(HloInstruction::CreateBinary(feature_shape, HloOpcode::kAdd, variance, - epsilon)), - neg_half)); + auto rsqrt_var_add_epsilon_broadcasted = + add(Rsqrt(add_binary(activation_shape, HloOpcode::kAdd, + variance_broadcasted, epsilon_activation))); + + auto rsqrt_var_add_epsilon = add(Rsqrt( + add_binary(feature_shape, HloOpcode::kAdd, variance, epsilon_feature))); // X - E[X]. 
- auto activation_minus_mean = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted)); + auto activation_minus_mean = add_binary( + activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted); // Grad[Y] * (X - E[X]). auto grad_output_times_activiation_minus_mean = - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply, - grad_output, activation_minus_mean)); + add_binary(activation_shape, HloOpcode::kMultiply, grad_output, + activation_minus_mean); HloComputation* add_reduce_computation = - GetScalarBinaryComputation(ptype, HloOpcode::kAdd); + GetOrCreateScalarAddComputation(ptype); // sum(Grad[Y] * (X - E[X])). auto sum_grad_output_times_activiation_minus_mean = @@ -529,9 +589,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( } // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]). - auto grad_scale = add(HloInstruction::CreateBinary( - feature_shape, HloOpcode::kMultiply, - sum_grad_output_times_activiation_minus_mean, rsqrt_var_add_epsilon)); + auto grad_scale = add_binary(feature_shape, HloOpcode::kMultiply, + sum_grad_output_times_activiation_minus_mean, + rsqrt_var_add_epsilon); // I2 = Sum(Grad[Y]) auto i2 = add(HloInstruction::CreateBroadcast(activation_shape, grad_beta, @@ -543,39 +603,40 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad( {feature_index})); // I4 = (X - E[X]) * I3 - auto i4 = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kMultiply, i3, activation_minus_mean)); + auto i4 = add_binary(activation_shape, HloOpcode::kMultiply, i3, + activation_minus_mean); // I5 = I4 / (Var[X] + epsilon) - auto i5 = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kDivide, i4, - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd, - variance_broadcasted, epsilon)))); + auto i5 = add_binary(activation_shape, HloOpcode::kDivide, i4, + add_binary(activation_shape, HloOpcode::kAdd, + variance_broadcasted, epsilon_activation)); // scale * rsqrt[Var[X] + epsilon] * 1/N - auto scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kMultiply, scale_broadcasted, - rsqrt_var_add_epsilon_broadcasted)); + auto scale_times_rsqrt_var_add_epsilon = + add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted, + rsqrt_var_add_epsilon_broadcasted); - scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary( - activation_shape, HloOpcode::kDivide, scale_times_rsqrt_var_add_epsilon, - elements_per_feature)); + scale_times_rsqrt_var_add_epsilon = + add(Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon)); - auto i1 = - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply, - grad_output, elements_per_feature)); + auto elements_per_feature_literal = + Literal::CreateR0(elements_per_feature_int64); + TF_ASSIGN_OR_RETURN(elements_per_feature_literal, + elements_per_feature_literal->Convert(ptype)); + auto elements_per_feature = add( + HloInstruction::CreateConstant(std::move(elements_per_feature_literal))); + auto i1 = add_binary(activation_shape, HloOpcode::kMultiply, grad_output, + add(HloInstruction::CreateBroadcast( + activation_shape, elements_per_feature, {}))); // I6 = I1 - I2 - I5 - auto i6 = add(HloInstruction::CreateBinary( + auto i6 = add_binary( activation_shape, HloOpcode::kSubtract, - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract, - i1, i2)), - i5)); + add_binary(activation_shape, HloOpcode::kSubtract, i1, 
i2), i5); // Grad[X] = scale * rsqrt[Var[X] + epsilon] * 1/N * I6. - auto grad_activation = - add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply, - scale_times_rsqrt_var_add_epsilon, i6)); + auto grad_activation = add_binary(activation_shape, HloOpcode::kMultiply, + scale_times_rsqrt_var_add_epsilon, i6); auto tuple = HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta}); if (batch_norm->has_sharding()) { diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc index 08d0152e3cfcfc..1b8b2d20450357 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding.cc @@ -182,15 +182,26 @@ Status BFloat16ConversionFoldingVisitor::DefaultAction(HloInstruction* hlo) { Status BFloat16ConversionFoldingVisitor::HandleCrossReplicaSum( HloInstruction* crs) { - if (!ShapeUtil::IsTuple(crs->shape()) || - !bfloat16_support_->SupportsMixedPrecisions(*crs)) { - return DefaultAction(crs); - } - // First use DefaultAction() to handle the operands. It can't handle // tuple-shaped output. TF_RETURN_IF_ERROR(DefaultAction(crs)); + if (!bfloat16_support_->SupportsMixedPrecisions(*crs)) { + return Status::OK(); + } + + // If the output is not a tuple, we don't need special handling. + if (!ShapeUtil::IsTuple(crs->shape())) { + return Status::OK(); + } + + // If crs is the root instruction, we should keep its original output type. + // The root instruction implicitly has a use from being the result of the + // computation, and the code below does not take this use into account. + if (crs == computation_->root_instruction()) { + return Status::OK(); + } + // Then do per-tuple-element handling on the output. 
std::vector> per_tuple_element_gtes( crs->operand_count()); diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc index 28e71c2054f59b..7fd1e733e96da9 100644 --- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc @@ -211,6 +211,17 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) { TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { auto builder = HloComputation::Builder(TestName()); + + auto module = CreateNewModule(); + HloComputation::Builder sum_builder("add"); + auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x")); + auto y = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y")); + sum_builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, x, y)); + HloComputation* sum = module->AddEmbeddedComputation(sum_builder.Build()); + Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4}); Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4}); @@ -223,7 +234,8 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateCrossReplicaSum( - ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b})); + ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}, + sum)); HloInstruction* gte_a = builder.AddInstruction( HloInstruction::CreateGetTupleElement(f32_shape, crs, 0)); HloInstruction* gte_b = builder.AddInstruction( @@ -233,7 +245,6 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) { HloInstruction* tuple = builder.AddInstruction( HloInstruction::CreateTuple({gte_a, convert_gte_b})); - auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_TRUE(FoldConversions(module.get())); diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc index 1afaefd9df9c57..9926661dd30600 100644 --- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc @@ -228,6 +228,17 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) { } TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { + auto module = CreateNewModule(); + HloComputation::Builder sum_builder("sum"); + auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x")); + auto y = sum_builder.AddInstruction(HloInstruction::CreateParameter( + /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y")); + sum_builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, x, y)); + HloComputation* reduction = + module->AddEmbeddedComputation(sum_builder.Build()); + auto builder = HloComputation::Builder(TestName()); Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4}); Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4}); @@ -239,11 +250,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) { HloInstruction* crs = builder.AddInstruction(HloInstruction::CreateCrossReplicaSum( - ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b})); + 
ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}, + reduction)); HloInstruction* gte = builder.AddInstruction( HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1)); - auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); EXPECT_TRUE(Normalize(module.get())); diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.cc b/tensorflow/compiler/xla/service/bfloat16_propagation.cc index 43ebe92c5ec1c9..ed0746980f87ac 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.cc @@ -33,7 +33,7 @@ BFloat16Propagation::BFloat16Propagation( const BFloat16Support* bfloat16_support) : bfloat16_support_(bfloat16_support) {} -void BFloat16Propagation::DetermineAndMutateFusionComputationPrecision( +void BFloat16Propagation::DetermineFusionComputationPrecision( HloInstruction* fusion) { CHECK_EQ(fusion->opcode(), HloOpcode::kFusion); if (!bfloat16_support_->SupportsMixedPrecisions(*fusion)) { @@ -48,15 +48,13 @@ void BFloat16Propagation::DetermineAndMutateFusionComputationPrecision( auto root = fusion->fused_instructions_computation()->root_instruction(); // Adjust root's element types according to the fusion's output shape. - ShapeUtil::ForEachMutableSubshape( - root->mutable_shape(), [&](Shape* subshape, const ShapeIndex& index) { - if (subshape->element_type() != F32) { + ShapeUtil::ForEachSubshape( + root->shape(), [&](const Shape& subshape, const ShapeIndex& index) { + if (subshape.element_type() != F32) { return; } - if (ShapeUtil::GetSubshape(fusion->shape(), index).element_type() == - BF16) { - subshape->set_element_type(BF16); - changed_ = true; + if (OutputTypeAfterChange(fusion, index) == BF16) { + AddToOrRemoveFromBF16ChangeSet(root, index, BF16); VLOG(2) << "Fused root " << root->ToString() << " at shape index " << index << " changed to BF16 precision for fusion " << fusion->ToString(); @@ -67,13 +65,101 @@ void BFloat16Propagation::DetermineAndMutateFusionComputationPrecision( auto insts = fusion->fused_instructions_computation()->MakeInstructionPostOrder(); for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) { - DetermineAndMutateInstructionPrecision(*inst_it, /*skip_parameters=*/false); + DetermineInstructionPrecision(*inst_it, /*skip_parameters=*/false); } - computations_visited_in_mutation_pass_.insert( + computations_visited_in_backward_pass_.insert( fusion->fused_instructions_computation()); + + RevertIfFusionInternalBF16Changes(fusion); +} + +void BFloat16Propagation::RevertIfFusionInternalBF16Changes( + HloInstruction* fusion) { + auto has_changes = [this](HloInstruction* inst) { + auto it = changes_to_bf16_.find(inst); + return it != changes_to_bf16_.end() && !it->second.empty(); + }; + + auto root = fusion->fused_instructions_computation()->root_instruction(); + tensorflow::gtl::FlatSet changed_root_buffers; + + auto root_changes_it = changes_to_bf16_.find(root); + if (root_changes_it != changes_to_bf16_.end()) { + for (const auto& index : root_changes_it->second) { + for (const HloValue* value : + dataflow_->GetValueSet(root, index).values()) { + changed_root_buffers.insert(value); + } + } + } + + auto aliases_changed_root_buffer = + [this, &changed_root_buffers](const HloInstruction* inst) { + bool aliasing = false; + ShapeUtil::ForEachSubshape( + inst->shape(), [&](const Shape& subshape, const ShapeIndex& index) { + if (aliasing) { + // Skip if aliasing is already found. 
+ return; + } + // Only F32 buffers are considered for changing to BF16 in this + // pass. + if (subshape.element_type() != F32) { + return; + } + for (const HloValue* value : + dataflow_->GetValueSet(inst, index).values()) { + if (ContainsKey(changed_root_buffers, value)) { + aliasing = true; + break; + } + } + }); + return aliasing; + }; + + for (auto inst : + fusion->fused_instructions_computation()->MakeInstructionPostOrder()) { + if (inst->opcode() == HloOpcode::kParameter) { + continue; + } + if (aliases_changed_root_buffer(inst)) { + continue; + } + if (inst->opcode() == HloOpcode::kFusion) { + bool parameter_reverted = false; + for (int64 i = 0; i < inst->operand_count(); ++i) { + if (has_changes(inst->mutable_operand(i))) { + // Changes on the operand have not been reverted. + continue; + } + auto* fused_parameter = inst->fused_parameter(i); + if (has_changes(fused_parameter)) { + changes_to_bf16_.erase(fused_parameter); + parameter_reverted = true; + } + } + if (parameter_reverted) { + RevertIfFusionInternalBF16Changes(inst); + } + } + if (!has_changes(inst)) { + continue; + } + bool revert_changes = true; + for (auto operand : inst->operands()) { + if (has_changes(operand)) { + revert_changes = false; + break; + } + } + if (revert_changes) { + changes_to_bf16_.erase(inst); + } + } } -void BFloat16Propagation::DetermineAndMutateWhileComputationsPrecision( +void BFloat16Propagation::DetermineWhileComputationsPrecision( HloInstruction* while_hlo) { CHECK_EQ(while_hlo->opcode(), HloOpcode::kWhile); @@ -86,16 +172,14 @@ void BFloat16Propagation::DetermineAndMutateWhileComputationsPrecision( auto body_root = body->root_instruction(); HloComputation* condition = while_hlo->while_condition(); - ShapeUtil::ForEachMutableSubshape( - body_root->mutable_shape(), - [this, while_hlo, body_root](Shape* subshape, const ShapeIndex& index) { - if (subshape->element_type() != F32) { + ShapeUtil::ForEachSubshape( + body_root->shape(), [this, while_hlo, body_root]( + const Shape& subshape, const ShapeIndex& index) { + if (subshape.element_type() != F32) { return; } - if (ShapeUtil::GetSubshape(while_hlo->shape(), index).element_type() == - BF16) { - subshape->set_element_type(BF16); - changed_ = true; + if (OutputTypeAfterChange(while_hlo, index) == BF16) { + AddToOrRemoveFromBF16ChangeSet(body_root, index, BF16); VLOG(2) << "While body root " << body_root->ToString() << " at shape index " << index << " changed to BF16 precision for while " @@ -106,30 +190,30 @@ void BFloat16Propagation::DetermineAndMutateWhileComputationsPrecision( auto body_insts = body->MakeInstructionPostOrder(); for (auto inst_it = body_insts.rbegin(); inst_it != body_insts.rend(); ++inst_it) { - DetermineAndMutateInstructionPrecision(*inst_it, /*skip_parameters=*/false); + DetermineInstructionPrecision(*inst_it, /*skip_parameters=*/false); } - computations_visited_in_mutation_pass_.insert(body); + computations_visited_in_backward_pass_.insert(body); auto condition_insts = condition->MakeInstructionPostOrder(); for (auto inst_it = condition_insts.rbegin(); inst_it != condition_insts.rend(); ++inst_it) { - DetermineAndMutateInstructionPrecision(*inst_it, /*skip_parameters=*/false); + DetermineInstructionPrecision(*inst_it, /*skip_parameters=*/false); } - computations_visited_in_mutation_pass_.insert(condition); + computations_visited_in_backward_pass_.insert(condition); } bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, const ShapeIndex& index) const { - auto value_set = dataflow_->GetValueSet(&hlo, 
index); + auto& value_set = dataflow_->GetValueSet(&hlo, index); for (const HloValue* value : value_set.values()) { if (ContainsKey(values_that_must_be_kept_as_f32_, value)) { return false; } - if (value->shape().element_type() == BF16) { + if (ValueTypeAfterChange(value) == BF16) { continue; } for (const HloUse& use : value->uses()) { - if (!ContainsKey(instructions_visited_in_mutation_pass_, + if (!ContainsKey(instructions_visited_in_backward_pass_, use.instruction)) { // We don't know yet whether use.instruction will consume BF16 since it // hasn't been visited. Although we visit instructions in reverse @@ -145,26 +229,23 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, // precision, or a called computation's parameters have been changed to // BF16 for fusions or whiles. if (use.instruction->opcode() == HloOpcode::kFusion) { - const auto* fused_parameter = + auto* fused_parameter = use.instruction->fused_parameter(use.operand_number); - if (ShapeUtil::GetSubshape(fused_parameter->shape(), use.operand_index) - .element_type() != BF16) { + if (OutputTypeAfterChange(fused_parameter, use.operand_index) != BF16) { return false; } continue; } else if (use.instruction->opcode() == HloOpcode::kWhile) { - const auto* cond_parameter = + auto* cond_parameter = use.instruction->while_condition()->parameter_instruction( use.operand_number); - if (ShapeUtil::GetSubshape(cond_parameter->shape(), use.operand_index) - .element_type() != BF16) { + if (OutputTypeAfterChange(cond_parameter, use.operand_index) != BF16) { return false; } - const auto* body_parameter = + auto* body_parameter = use.instruction->while_body()->parameter_instruction( use.operand_number); - if (ShapeUtil::GetSubshape(body_parameter->shape(), use.operand_index) - .element_type() != BF16) { + if (OutputTypeAfterChange(body_parameter, use.operand_index) != BF16) { return false; } continue; @@ -174,19 +255,20 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, continue; } // If the op propagates precision and it outputs a BF16, then it's OK to - // supply BF16 also as the input. In the backward mutation pass, the users - // shapes should have already been processed. + // supply BF16 also as the input. In the backward pass, the users shapes + // should have already been processed. 
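The AllUsersConsumeBF16 changes above boil down to one rule: a value may be recorded as BF16 only if every use either already reads BF16 at the corresponding operand or parameter position, or merely forwards the operand precision to an output that has itself been marked BF16. A minimal sketch of that decision, using stand-in types rather than the real HLO classes (Use, Precision and the field names are mine, for illustration only):

```cpp
#include <vector>

enum class Precision { F32, BF16 };

struct Use {
  bool consumes_bf16;                 // e.g. a fused/while parameter already set to BF16
  bool propagates_operand_precision;  // e.g. tuple-like ops that pass precision through
  Precision output_after_change;      // output type once pending changes are applied
};

// Returns true if demoting the producing value to BF16 cannot hurt any user.
bool AllUsersConsumeBF16(const std::vector<Use>& uses) {
  for (const Use& use : uses) {
    if (use.consumes_bf16) continue;
    if (use.propagates_operand_precision &&
        use.output_after_change == Precision::BF16) {
      continue;
    }
    return false;  // at least one user still needs F32
  }
  return true;
}
```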
PrimitiveType user_output_type = PRIMITIVE_TYPE_INVALID; if (use.instruction->opcode() == HloOpcode::kTuple || (use.instruction->opcode() == HloOpcode::kCrossReplicaSum && ShapeUtil::IsTuple(use.instruction->shape()))) { - user_output_type = ShapeUtil::GetSubshape( - ShapeUtil::GetSubshape(use.instruction->shape(), - {use.operand_number}), - use.operand_index) - .element_type(); + ShapeIndex use_output_index{use.operand_number}; + for (int64 i : use.operand_index) { + use_output_index.push_back(i); + } + user_output_type = + OutputTypeAfterChange(use.instruction, use_output_index); } else { - user_output_type = use.instruction->shape().element_type(); + user_output_type = OutputTypeAfterChange(use.instruction, {}); } if (bfloat16_support_->EffectiveOperandPrecisionIsOutputPrecision( *use.instruction, use.operand_number) && @@ -199,8 +281,8 @@ bool BFloat16Propagation::AllUsersConsumeBF16(const HloInstruction& hlo, return true; } -void BFloat16Propagation::DetermineAndMutateInstructionPrecision( - HloInstruction* hlo, bool skip_parameters) { +void BFloat16Propagation::DetermineInstructionPrecision(HloInstruction* hlo, + bool skip_parameters) { // We handle any fusion computation or while body/condition after the // instruction is handled, because we need to know the output shape of a // fusion or while before propagating inside its computations. @@ -209,12 +291,12 @@ void BFloat16Propagation::DetermineAndMutateInstructionPrecision( [this, hlo, &postpone_processing_called_computations] { if (!postpone_processing_called_computations) { if (hlo->opcode() == HloOpcode::kFusion) { - DetermineAndMutateFusionComputationPrecision(hlo); + DetermineFusionComputationPrecision(hlo); } else if (hlo->opcode() == HloOpcode::kWhile) { - DetermineAndMutateWhileComputationsPrecision(hlo); + DetermineWhileComputationsPrecision(hlo); } } - instructions_visited_in_mutation_pass_.insert(hlo); + instructions_visited_in_backward_pass_.insert(hlo); }); if (hlo->opcode() == HloOpcode::kWhile && @@ -245,9 +327,9 @@ void BFloat16Propagation::DetermineAndMutateInstructionPrecision( CHECK(hlo->parent() != nullptr); if (hlo == hlo->parent()->root_instruction()) { if (!hlo->parent()->IsFusionComputation()) { - ShapeUtil::ForEachSubshape(hlo->shape(), [&](const Shape& subshape, + ShapeUtil::ForEachSubshape(hlo->shape(), [&](const Shape& /* subshape */, const ShapeIndex& index) { - if (subshape.element_type() != F32) { + if (OutputTypeAfterChange(hlo, index) != F32) { return; } for (const auto* value : dataflow_->GetValueSet(hlo, index).values()) { @@ -269,13 +351,12 @@ void BFloat16Propagation::DetermineAndMutateInstructionPrecision( return; } - ShapeUtil::ForEachMutableSubshape( - hlo->mutable_shape(), - [hlo, this](Shape* subshape, const ShapeIndex& index) { - if (subshape->element_type() == F32 && + ShapeUtil::ForEachSubshape( + hlo->shape(), + [hlo, this](const Shape& /* subshape */, const ShapeIndex& index) { + if (OutputTypeAfterChange(hlo, index) == F32 && AllUsersConsumeBF16(*hlo, index)) { - subshape->set_element_type(BF16); - changed_ = true; + AddToOrRemoveFromBF16ChangeSet(hlo, index, BF16); VLOG(2) << "HloInstruction output at shape index " << index << " changed to BF16 precision: " << hlo->ToString(); } @@ -308,26 +389,24 @@ void BFloat16Propagation::AdjustCalledComputationParameters( CHECK_EQ(operands.size(), computation->num_parameters()); for (int64 i = 0; i < operands.size(); ++i) { auto parameter = computation->parameter_instruction(i); - ShapeUtil::ForEachMutableSubshape( - parameter->mutable_shape(), - 
[this, i, hlo, &operands, parameter](Shape* subshape, + ShapeUtil::ForEachSubshape( + parameter->shape(), + [this, i, hlo, &operands, parameter](const Shape& /* subshape */, const ShapeIndex& index) { if (!ShapeUtil::IsLeafIndex(parameter->shape(), index)) { return; } PrimitiveType operand_type = - ShapeUtil::GetSubshape(operands[i]->shape(), index) - .element_type(); - if (subshape->element_type() == operand_type) { + OutputTypeAfterChange(operands[i], index); + if (OutputTypeAfterChange(parameter, index) == operand_type) { return; } - CHECK(operand_type == F32 || operand_type == BF16); - subshape->set_element_type(operand_type); - changed_ = true; + AddToOrRemoveFromBF16ChangeSet(parameter, index, operand_type); VLOG(2) << "Called computation parameter " << parameter->ToString() << " at shape index " << index - << " adjusted to match operand in HLO " - << hlo->ToString(); + << " adjusted to " + << (operand_type == BF16 ? "BF16" : "F32") + << " to match operand in HLO " << hlo->ToString(); }); } }; @@ -348,51 +427,48 @@ void BFloat16Propagation::AdjustCalledComputationParameters( void BFloat16Propagation::AdjustCalledComputationRoot(HloInstruction* hlo) { auto adjust_computation = [this, hlo](HloComputation* computation, - const Shape& output_shape) { + HloInstruction* output) { // Adjust root. HloInstruction* root = computation->root_instruction(); - ShapeUtil::ForEachMutableSubshape( - root->mutable_shape(), [this, hlo, root, &output_shape]( - Shape* subshape, const ShapeIndex& index) { - if (!ShapeUtil::IsLeafIndex(hlo->shape(), index)) { - return; - } - const PrimitiveType output_type = - ShapeUtil::GetSubshape(output_shape, index).element_type(); - if (subshape->element_type() == output_type) { - return; - } - CHECK(output_type == F32 || output_type == BF16); - subshape->set_element_type(output_type); - // It's possible that output_type is F32, but the root instruction's - // type is BF16; e.g., a fusion node's output was changed to BF16 - // initially but then adjusted back to F32, and the fusion computation - // is now being adjusted after the fusion node. - if (output_type == F32) { - for (const auto* value : - dataflow_->GetValueSet(root, index).values()) { - // We rely on the fact that this adjustment works in reverse - // topological order so that called computation will be - // processed later. Adding the value to - // values_that_must_be_kept_as_f32_ will ensure the - // correctness of the adjustment for HLOs that will be - // processed later. - values_that_must_be_kept_as_f32_.insert(value); - } - } - changed_ = true; - VLOG(2) << "Called computation root " << root->ToString() - << " at shape index " << index - << " adjusted to match output shape of " << hlo->ToString(); - }); + ShapeUtil::ForEachSubshape(root->shape(), [this, hlo, root, output]( + const Shape& /* subshape */, + const ShapeIndex& index) { + if (!ShapeUtil::IsLeafIndex(hlo->shape(), index)) { + return; + } + const PrimitiveType output_type = OutputTypeAfterChange(output, index); + if (OutputTypeAfterChange(root, index) == output_type) { + return; + } + AddToOrRemoveFromBF16ChangeSet(root, index, output_type); + // It's possible that output_type is F32, but the root instruction's + // type is BF16; e.g., a fusion node's output was changed to BF16 + // initially but then adjusted back to F32, and the fusion computation + // is now being adjusted after the fusion node. 
+ if (output_type == F32) { + for (const auto* value : dataflow_->GetValueSet(root, index).values()) { + // We rely on the fact that this adjustment works in reverse + // topological order so that called computation will be + // processed later. Adding the value to + // values_that_must_be_kept_as_f32_ will ensure the + // correctness of the adjustment for HLOs that will be + // processed later. + values_that_must_be_kept_as_f32_.insert(value); + } + } + VLOG(2) << "Called computation root " << root->ToString() + << " at shape index " << index << " adjusted to " + << (output_type == BF16 ? "BF16" : "F32") + << " to match output shape of " << hlo->ToString(); + }); }; switch (hlo->opcode()) { case HloOpcode::kFusion: - adjust_computation(hlo->fused_instructions_computation(), hlo->shape()); + adjust_computation(hlo->fused_instructions_computation(), hlo); break; case HloOpcode::kWhile: - adjust_computation(hlo->while_body(), hlo->shape()); + adjust_computation(hlo->while_body(), hlo); break; default: break; @@ -409,16 +485,19 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) { auto hlo = *inst_it; auto adjust_hlo_output = [this, hlo, ¶meter_changed]( - Shape* subshape, const ShapeIndex& index) { - if (subshape->element_type() != F32 && subshape->element_type() != BF16) { + const Shape& /* subshape */, + const ShapeIndex& index) { + auto output_type = OutputTypeAfterChange(hlo, index); + if (output_type != F32 && output_type != BF16) { return; } PrimitiveType type = BF16; for (const auto* value : dataflow_->GetValueSet(hlo, index).values()) { - if (value->shape().element_type() == BF16) { + auto value_type = ValueTypeAfterChange(value); + if (value_type == BF16) { continue; } - CHECK_EQ(value->shape().element_type(), F32); + CHECK_EQ(value_type, F32); type = F32; break; } @@ -437,16 +516,17 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( values_that_must_be_kept_as_f32_.insert(value); } } - if (type != subshape->element_type()) { - subshape->set_element_type(type); + if (type != output_type) { + AddToOrRemoveFromBF16ChangeSet(hlo, index, type); VLOG(2) << "HloInstruction output at shape index " << index - << " adjusted to " << *subshape << ": " << hlo->ToString(); + << " adjusted to " << (type == BF16 ? 
"BF16" : "F32") << ": " + << hlo->ToString(); if (hlo->opcode() == HloOpcode::kParameter) { parameter_changed = true; } } }; - ShapeUtil::ForEachMutableSubshape(hlo->mutable_shape(), adjust_hlo_output); + ShapeUtil::ForEachSubshape(hlo->shape(), adjust_hlo_output); AdjustCalledComputationRoot(hlo); if (hlo->opcode() == HloOpcode::kWhile) { // We need to run on the while body and condition repeatedly until a fixed @@ -463,8 +543,7 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( ResolveInconsistencyOfAliasingBuffersHelper(hlo->while_body(), &visited_in_while)) { visited_in_while.clear(); - ShapeUtil::ForEachMutableSubshape(hlo->mutable_shape(), - adjust_hlo_output); + ShapeUtil::ForEachSubshape(hlo->shape(), adjust_hlo_output); AdjustCalledComputationRoot(hlo); } visited_computations->insert(visited_in_while.begin(), @@ -478,7 +557,7 @@ bool BFloat16Propagation::ResolveInconsistencyOfAliasingBuffersHelper( return parameter_changed; } -Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( +void BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( HloModule* module) { std::list computations_topological_order = module->MakeComputationPostOrder(); @@ -490,7 +569,9 @@ Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( } ResolveInconsistencyOfAliasingBuffersHelper(*comp_it, &resolved); } +} +Status BFloat16Propagation::ResolveInconsistentFusions(HloModule* module) { // We could have changed a fusion computation's root shape to have a different // precision than the fusion node's output, if the fusion root does not // define a buffer (e.g., a tuple). Now we add conversions after such fusion @@ -517,7 +598,7 @@ Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( // (2) after adding conversion // (3) after tuple simplifier and DCE. bool needs_tuple_simplifier = false; - for (auto computation : computations_topological_order) { + for (auto computation : module->MakeComputationPostOrder()) { auto insts = computation->MakeInstructionPostOrder(); for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) { auto hlo = *inst_it; @@ -587,7 +668,14 @@ Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( needs_tuple_simplifier |= ShapeUtil::IsTuple(hlo->shape()); } } + if (needs_tuple_simplifier) { + TupleSimplifier tuple_simplifier; + TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); + } + return Status::OK(); +} +Status BFloat16Propagation::ResolveConvertedConstants(HloModule* module) { // We may have converted some constants from F32 to BF16, so adjust the // constant literals in such cases. We do this here instead of when the // constant node's is changed because 1) the HloInstruction interface does not @@ -598,8 +686,7 @@ Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( // can avoid repeated conversions. // // TODO(b/73833576): Consider resetting literal in HloInstruction. 
- bool needs_dce = needs_tuple_simplifier; - for (auto computation : computations_topological_order) { + for (auto computation : module->MakeComputationPostOrder()) { for (auto hlo : computation->MakeInstructionPostOrder()) { if (hlo->opcode() != HloOpcode::kConstant) { continue; @@ -612,23 +699,13 @@ Status BFloat16Propagation::ResolveInconsistencyOfAliasingBuffers( auto new_constant = computation->AddInstruction( HloInstruction::CreateConstant(std::move(converted_literal))); TF_RETURN_IF_ERROR(hlo->ReplaceAllUsesWith(new_constant)); - needs_dce = true; } } } - - if (needs_tuple_simplifier) { - TupleSimplifier tuple_simplifier; - TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); - } - if (needs_dce) { - HloDCE dce; - TF_RETURN_IF_ERROR(dce.Run(module).status()); - } return Status::OK(); } -Status BFloat16Propagation::RemoveNoopConversions(HloModule* module) { +Status BFloat16Propagation::SkipNoopConversions(HloModule* module) { for (auto computation : module->computations()) { for (auto hlo : computation->MakeInstructionPostOrder()) { if (hlo->opcode() != HloOpcode::kConvert) { @@ -643,7 +720,6 @@ Status BFloat16Propagation::RemoveNoopConversions(HloModule* module) { if (is_root) { computation->set_root_instruction(source); } - TF_RETURN_IF_ERROR(computation->RemoveInstructionAndUnusedOperands(hlo)); } } return Status::OK(); @@ -652,8 +728,18 @@ Status BFloat16Propagation::RemoveNoopConversions(HloModule* module) { // The algorithm first does a forward pass (parameters to root) to determine a // set of instructions to consider using bfloat16, then does a backward pass to // determine the precisions of those instructions according to the need of -// their users. +// their users. During the backward pass, the potential changes are stored in +// changes_to_bf16_ which are subject to further adjustments then applied to the +// HLOs. StatusOr BFloat16Propagation::Run(HloModule* module) { + consider_using_bfloat16_.clear(); + instructions_visited_in_backward_pass_.clear(); + computations_visited_in_backward_pass_.clear(); + values_that_must_be_kept_as_f32_.clear(); + caller_counts_.clear(); + changes_to_bf16_.clear(); + changed_ = false; + TF_ASSIGN_OR_RETURN(dataflow_, HloDataflowAnalysis::Run(*module)); std::list computations_topological_order = @@ -686,8 +772,24 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { } auto insts = (*comp_it)->MakeInstructionPostOrder(); for (auto inst_it = insts.rbegin(); inst_it != insts.rend(); ++inst_it) { - DetermineAndMutateInstructionPrecision(*inst_it, - /*skip_parameters=*/true); + DetermineInstructionPrecision(*inst_it, + /*skip_parameters=*/true); + } + } + + // It's possible that an instruction does not define a buffer, but the + // defining instruction's shape has changed. So we need to adjust the output + // shapes of instructions according to the HLO values they refer to. + ResolveInconsistencyOfAliasingBuffers(module); + + // Apply the changes in changes_to_bf16_. + for (auto& change : changes_to_bf16_) { + auto shape = change.first->mutable_shape(); + for (const auto& index : change.second) { + auto subshape = ShapeUtil::GetMutableSubshape(shape, index); + CHECK_EQ(subshape->element_type(), F32); + subshape->set_element_type(BF16); + changed_ = true; } } @@ -695,15 +797,56 @@ StatusOr BFloat16Propagation::Run(HloModule* module) { return false; } - // It's possible that an instruction does not define a buffer, but the - // defining instruction's shape has changed. 
So we need to adjust the output - // shapes of instructions according to the HLO values they refer to. - TF_RETURN_IF_ERROR(ResolveInconsistencyOfAliasingBuffers(module)); + TF_RETURN_IF_ERROR(ResolveInconsistentFusions(module)); + TF_RETURN_IF_ERROR(ResolveConvertedConstants(module)); // This pass could have turned an F32 -> BF16 conversion to a no-op (BF16 -> - // BF16), so we remove them now. - TF_RETURN_IF_ERROR(RemoveNoopConversions(module)); + // BF16), so we skip them now. + TF_RETURN_IF_ERROR(SkipNoopConversions(module)); + + { + // We may have dead HLOs after ResolveInconsistentFusions, + // ResolveConvertedConstants and SkipNoopConversions. + HloDCE dce; + TF_RETURN_IF_ERROR(dce.Run(module).status()); + } return true; } +PrimitiveType BFloat16Propagation::OutputTypeAfterChange( + HloInstruction* hlo, const ShapeIndex& index) const { + PrimitiveType type_on_hlo = + ShapeUtil::GetSubshape(hlo->shape(), index).element_type(); + if (type_on_hlo != F32) { + return type_on_hlo; + } + auto it = changes_to_bf16_.find(hlo); + if (it == changes_to_bf16_.end()) { + return type_on_hlo; + } + return ContainsKey(it->second, index) ? BF16 : F32; +} + +PrimitiveType BFloat16Propagation::ValueTypeAfterChange( + const HloValue* value) const { + auto hlo = value->defining_instruction(); + const auto& position = value->defining_position(); + return OutputTypeAfterChange(hlo, position.index); +} + +void BFloat16Propagation::AddToOrRemoveFromBF16ChangeSet( + HloInstruction* hlo, const ShapeIndex& index, PrimitiveType target_type) { + if (target_type == BF16) { + auto& entry = changes_to_bf16_[hlo]; + entry.insert(index); + } else { + CHECK_EQ(target_type, F32); + auto it = changes_to_bf16_.find(hlo); + if (it == changes_to_bf16_.end()) { + return; + } + it->second.erase(index); + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation.h b/tensorflow/compiler/xla/service/bfloat16_propagation.h index 1744e9db90aeff..de0355ddfca127 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation.h +++ b/tensorflow/compiler/xla/service/bfloat16_propagation.h @@ -26,6 +26,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/lib/hash/hash.h" namespace xla { @@ -85,30 +86,39 @@ class BFloat16Propagation : public HloPassInterface { tensorflow::gtl::FlatSet consider_using_bfloat16_; // *************************** - // Functions called and state produced by the backward mutation pass (from - // root to parameters). + // Functions called and state produced by the backward pass (from root to + // parameters) that finds opportunities to use BF16. - // Determines the precision for the given instruction in the mutation pass. - void DetermineAndMutateInstructionPrecision(HloInstruction* hlo, - bool skip_parameters); + // Determines the precision for the given instruction in the + // opportunity-finding pass. + void DetermineInstructionPrecision(HloInstruction* hlo, bool skip_parameters); - // Special handling in the mutation pass for fusion computations. + // Special handling in the opportunity-finding pass for fusion computations. // // Precondition: hlo->opcode() == kFusion - void DetermineAndMutateFusionComputationPrecision(HloInstruction* fusion); + void DetermineFusionComputationPrecision(HloInstruction* fusion); - // Special handling in the mutation pass for while computations. 
+ // Reverts changes to BF16 that will not propagate outside a fusion + // computation. This avoids BF16 casts overhead inside a fusion which won't + // save memory bandwidth. + // + // Precondition: hlo->opcode() == kFusion + void RevertIfFusionInternalBF16Changes(HloInstruction* fusion); + + // Special handling in the opportunity-finding pass for while computations. // // Precondition: hlo->opcode() == kWhile - void DetermineAndMutateWhileComputationsPrecision(HloInstruction* while_hlo); + void DetermineWhileComputationsPrecision(HloInstruction* while_hlo); - // The set of HloInstructions that have been visited in the mutation pass. + // The set of HloInstructions that have been visited in the + // opportunity-finding pass. tensorflow::gtl::FlatSet - instructions_visited_in_mutation_pass_; + instructions_visited_in_backward_pass_; - // The set of HloComputations that have been visited in the mutation pass. + // The set of HloComputations that have been visited in the + // opportunity-finding pass. tensorflow::gtl::FlatSet - computations_visited_in_mutation_pass_; + computations_visited_in_backward_pass_; // *************************** // Functions called by the final inconsistency resolving pass. @@ -116,7 +126,7 @@ class BFloat16Propagation : public HloPassInterface { // Adjusts the output shapes of HloInstructions such that if two // HloInstructions have aliasing buffers in their outputs, they must have the // same precision. - Status ResolveInconsistencyOfAliasingBuffers(HloModule* module); + void ResolveInconsistencyOfAliasingBuffers(HloModule* module); // Resolves inconsistency of aliasing buffers for the given computation, and // recursively runs on a while instruction's condition and body until a fixed @@ -134,9 +144,19 @@ class BFloat16Propagation : public HloPassInterface { void AdjustCalledComputationRoot(HloInstruction* hlo); // *************************** - // Removes no-op conversions (same source and target shapes) that can be - // produced this pass. - Status RemoveNoopConversions(HloModule* module); + // Functions called after changes in changes_to_bf16_ are applied. + + // Resolves inconsistencies introduced by this pass for fusions with + // tuple-type output. + Status ResolveInconsistentFusions(HloModule* module); + + // Converts the literals in kConstant HLOs which have their types changed to + // BF16 by this pass. + Status ResolveConvertedConstants(HloModule* module); + + // Skips no-op conversions (same source and target shapes) that can be + // produced this pass, i.e., replaces them in their uses with their operands. + Status SkipNoopConversions(HloModule* module); // *************************** // Functions called and state used by two or more passes. @@ -146,6 +166,23 @@ class BFloat16Propagation : public HloPassInterface { bool AllUsersConsumeBF16(const HloInstruction& hlo, const ShapeIndex& index) const; + // The output element type of the HLO at the given shape index after changes + // in changes_to_bf16_ are applied. + PrimitiveType OutputTypeAfterChange(HloInstruction* hlo, + const ShapeIndex& index) const; + + // The element type of the HLO value after changes in changes_to_bf16_ are + // applied. + PrimitiveType ValueTypeAfterChange(const HloValue* value) const; + + // If target_type == BF16, adds the HLO at the given index to + // changes_to_bf16_; otherwise, target_type must be F32 and this function + // removes the HLO at the given index from changes_to_bf16_ if it was earlier + // added. 
+ void AddToOrRemoveFromBF16ChangeSet(HloInstruction* hlo, + const ShapeIndex& index, + PrimitiveType target_type); + // The set of F32 HLO values that must be kept in F32. tensorflow::gtl::FlatSet values_that_must_be_kept_as_f32_; @@ -153,10 +190,28 @@ class BFloat16Propagation : public HloPassInterface { // module. Populated at the beginning of this pass. tensorflow::gtl::FlatMap caller_counts_; + // We first store the potential F32-to-BF16 changes to changes_to_bf16_, which + // are subject to further adjustment, then finally applied to the HLOs. This + // avoids setting changed_ to true but all changes are reverted during + // adjustment. + struct IndexHasher { + int64 operator()(const ShapeIndex& index) const { + int64 hash = 0; + for (int64 i : index) { + hash = tensorflow::Hash64Combine(hash, std::hash()(i)); + } + return hash; + } + }; + tensorflow::gtl::FlatMap> + changes_to_bf16_; + + // Whether the last processed HLO module has been changed by this pass. + bool changed_ = false; + const BFloat16Support* bfloat16_support_; std::unique_ptr dataflow_; - - bool changed_ = false; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc index 183db1652e498e..5e1499ee6b6ef3 100644 --- a/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc +++ b/tensorflow/compiler/xla/service/bfloat16_propagation_test.cc @@ -149,12 +149,12 @@ TEST_F(BFloat16PropagationTest, ConvertConstantLiteral) { EXPECT_TRUE(OutputsBF16(dot->operand(1))); EXPECT_EQ(dot->operand(0)->opcode(), HloOpcode::kConstant); EXPECT_EQ(dot->operand(1)->opcode(), HloOpcode::kConstant); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( dot->operand(0)->literal(), - *LiteralTestUtil::ConvertF32ToBF16(*Literal::CreateFromArray(array_a))); - LiteralTestUtil::ExpectEqual( + *Literal::ConvertF32ToBF16(*Literal::CreateFromArray(array_a)))); + EXPECT_TRUE(LiteralTestUtil::Equal( dot->operand(1)->literal(), - *LiteralTestUtil::ConvertF32ToBF16(*Literal::CreateFromArray(array_b))); + *Literal::ConvertF32ToBF16(*Literal::CreateFromArray(array_b)))); } // Tests that BF16 can be propagated through nested tuples. @@ -323,6 +323,37 @@ TEST_F(BFloat16PropagationTest, PropagateThroughFusion) { EXPECT_TRUE(OutputsBF16(b_f1)); } +// Tests that changes to BF16 that cannot be propagated outside a fusion are +// discarded. 
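The changes_to_bf16_ map declared in the header hunk above keys a set of ShapeIndex values with a custom IndexHasher that folds the index elements together via Hash64Combine, so pending F32-to-BF16 changes can be tracked per (instruction, shape index) pair and applied only after the adjustment passes settle. A standalone sketch of the same folding idea, with std::hash and a simplified combine step standing in for TensorFlow's Hash64Combine (the combine constant and the `const void*` key are my simplifications, not the library's):

```cpp
#include <cstdint>
#include <functional>
#include <unordered_map>
#include <unordered_set>
#include <vector>

// Stand-in for ShapeIndex: a path of child indices into a nested tuple shape.
using ShapeIndex = std::vector<int64_t>;

struct IndexHasher {
  size_t operator()(const ShapeIndex& index) const {
    size_t hash = 0;
    for (int64_t i : index) {
      // Simplified combine step; the real code uses tensorflow::Hash64Combine.
      hash ^= std::hash<int64_t>()(i) + 0x9e3779b97f4a7c15ULL + (hash << 6) + (hash >> 2);
    }
    return hash;
  }
};

// Pending-change bookkeeping, keyed by instruction, with one shape index per entry.
using ChangeSet =
    std::unordered_map<const void*, std::unordered_set<ShapeIndex, IndexHasher>>;
```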
+TEST_F(BFloat16PropagationTest, DiscardFusionInternalBF16Changes) { + auto module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + Shape shape = ShapeUtil::MakeShape(F32, {2, 4}); + + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param)); + + auto builder_f = HloComputation::Builder("fusion"); + HloInstruction* a_f = + builder_f.AddInstruction(HloInstruction::CreateParameter(0, shape, "a")); + HloInstruction* b_f = + builder_f.AddInstruction(HloInstruction::CreateParameter(1, shape, "b")); + HloInstruction* add_f = builder_f.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_f, b_f)); + HloInstruction* dot_f = builder_f.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(F32, {4, 4}), HloOpcode::kDot, add_f, add_f)); + auto comp_f = module->AddEmbeddedComputation(builder_f.Build()); + auto fusion = builder.AddInstruction(HloInstruction::CreateFusion( + dot_f->shape(), HloInstruction::FusionKind::kCustom, {add, add}, comp_f)); + + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_FALSE(PropagatePrecision(module.get())); + EXPECT_EQ(computation->root_instruction(), fusion); +} + // Tests that if 1) the root instruction of a fusion is a tuple, 2) the fusion // outputs are only used by a dot, and 3) one element of the tuple is used by // an add in the fusion computation, then the propagation pass should create a diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc index dbe45e932cdeed..682c3865797c85 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment.cc @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/buffer_value_containers.h" #include "tensorflow/compiler/xla/service/heap_simulator.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" @@ -134,6 +135,7 @@ Status GatherComputationsByAllocationType( worklist.push_back(std::make_pair(subcomputation, false)); // Not thread local. break; + case HloOpcode::kCrossReplicaSum: case HloOpcode::kMap: case HloOpcode::kReduce: case HloOpcode::kReduceWindow: @@ -292,112 +294,6 @@ BufferAllocationProto BufferAllocation::ToProto() const { return proto; } -std::pair> -BufferAllocation::ComputePeakMemoryLogicalBuffers() const { - if (HeapTraces().empty()) { - // Just return the largest LogicalBuffer in the allocation. - const LogicalBuffer* largest_buffer = nullptr; - int64 largest_size = 0; - for (const auto& pair : assigned_buffers()) { - const LogicalBuffer* buffer = pair.first; - int64 size = pair.second.size; - if (largest_buffer == nullptr) { - largest_buffer = buffer; - largest_size = size; - continue; - } - // Tie-break with LogicalBuffer::Id so the return value is stable relative - // to changing addresses. 
- if (size > largest_size || - ((size == largest_size) && (largest_buffer->id() > buffer->id()))) { - largest_buffer = buffer; - largest_size = size; - } - } - CHECK(largest_buffer != nullptr) - << "No logical buffers in allocation: " << ToString(); - return {largest_size, {largest_buffer}}; - } - - // Create a map from LogicalBuffer::Id to LogicalBuffer* for the logical - // buffers in this allocation. - tensorflow::gtl::FlatMap - id_to_buffer; - tensorflow::gtl::FlatMap buffer_sizes; - for (const auto& pair : assigned_buffers()) { - const LogicalBuffer* buffer = pair.first; - const OffsetSize& offset_size = pair.second; - id_to_buffer[buffer->id()] = buffer; - buffer_sizes[buffer] = offset_size.size; - } - - // Returns how much the given event increases the total size of live - // buffers. Can be negative. - auto memory_delta = [this, &id_to_buffer, &buffer_sizes]( - const HeapSimulatorTrace::Event& event) -> int64 { - const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id()); - const int64 buffer_size = buffer_sizes.at(buffer); - if (event.kind() == HeapSimulatorTrace::Event::ALLOC) { - return buffer_size; - } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { - // Sharing a buffer does not change the live set size for the purposes of - // the heap simulator. Even though the shared-with buffer may be smaller, - // the entire allocation remains live. - return 0; - } else if (event.kind() == HeapSimulatorTrace::Event::FREE) { - return -1 * buffer_size; - } - LOG(FATAL) << "Unknown event kind: " << event.kind(); - }; - - int64 total_max_live_size = 0; - std::vector live_buffers_vector; - for (const HeapSimulatorTrace& heap_trace : HeapTraces()) { - // First compute the size of the maximal live set. - int64 max_live_size = 0; - int64 live_size = 0; - for (const auto& event : heap_trace.events()) { - live_size += memory_delta(event); - if (max_live_size < live_size) { - max_live_size = live_size; - } - } - - // Next gather the set of logical buffers live at the earliest point of - // maximal live set size. - tensorflow::gtl::FlatSet live_buffers; - live_size = 0; - for (const auto& event : heap_trace.events()) { - const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id()); - if (event.kind() == HeapSimulatorTrace::Event::ALLOC) { - InsertOrDie(&live_buffers, buffer); - } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { - // Nothing to do. - } else if (event.kind() == HeapSimulatorTrace::Event::FREE) { - CHECK(ContainsKey(live_buffers, buffer)); - live_buffers.erase(buffer); - } - - live_size += memory_delta(event); - if (live_size == max_live_size) { - break; - } - } - CHECK_EQ(live_size, max_live_size); - total_max_live_size += max_live_size; - - live_buffers_vector.insert(live_buffers_vector.end(), live_buffers.begin(), - live_buffers.end()); - } - - // Stabily sort the live buffers. 
- std::sort(live_buffers_vector.begin(), live_buffers_vector.end(), - [](const LogicalBuffer* a, const LogicalBuffer* b) { - return a->id() < b->id(); - }); - return {total_max_live_size, live_buffers_vector}; -} - string BufferAllocation::ToString() const { string output; Appendf(&output, "allocation %lld: %p, size %lld", index_, this, size()); @@ -610,6 +506,7 @@ BufferAllocation* BufferAssignment::NewAllocation(const LogicalBuffer& buffer, BufferAllocation* allocation = NewEmptyAllocation(size, is_thread_local, is_reusable, buffer.color()); AddAssignment(allocation, buffer, /*offset=*/0, size); + allocation->peak_buffers_.push_back(&buffer); return allocation; } @@ -680,6 +577,10 @@ void BufferAssignment::CombineTempAllocations() { CHECK_EQ(temp_allocation.HeapTraces().size(), 1); combined_allocation->AddHeapTrace(temp_allocation.HeapTraces().front()); } + combined_allocation->peak_buffers_.insert( + combined_allocation->peak_buffers_.end(), + temp_allocation.peak_buffers_.begin(), + temp_allocation.peak_buffers_.end()); } // Replace all existing temporary allocations with the new combined // allocations. @@ -800,7 +701,7 @@ BufferAssignmentProto BufferAssignment::ToProto() const { BufferAssignmentProto::BufferAlias* proto_alias = proto.add_buffer_aliases(); LogicalBufferProto::Location proto_alias_location = - LogicalBuffer::ToLocationProto(*alias.instruction(), alias.index()); + BufferValue::ToLocationProto(*alias.instruction(), alias.index()); proto_alias->set_source_buffer_id(buffer.id()); proto_alias->mutable_location()->Swap(&proto_alias_location); } @@ -1184,7 +1085,9 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); HeapSimulator::Options options; - options.buffers_to_assign = &single_colored_set.second; + BufferValueFlatSet buffer_value_set = + ToBufferValueFlatSet(single_colored_set.second); + options.buffers_to_assign = &buffer_value_set; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, HeapSimulator::Run(MakeUnique( @@ -1212,7 +1115,9 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( VLOG(2) << "Simulating heap for color " << color; int64 alignment = assignment->color_alignment_(color); HeapSimulator::Options options; - options.buffers_to_assign = &single_colored_set.second; + BufferValueFlatSet buffer_value_set = + ToBufferValueFlatSet(single_colored_set.second); + options.buffers_to_assign = &buffer_value_set; TF_ASSIGN_OR_RETURN( const HeapSimulator::Result result, HeapSimulator::Run(MakeUnique( @@ -1228,6 +1133,89 @@ Status BufferAssigner::AssignBuffersWithSequentialOrdering( return Status::OK(); } +namespace { + +// Computes and returns the set of logical buffers live at the point of maximal +// liveness in the given heap trace. LogicalBuffers are (stabily) sorted by id. +std::vector ComputePeakMemoryLogicalBuffers( + const BufferAllocation& allocation, const HeapSimulatorTrace& heap_trace) { + // Create a map from LogicalBuffer::Id to LogicalBuffer* for the logical + // buffers in this allocation. + tensorflow::gtl::FlatMap + id_to_buffer; + tensorflow::gtl::FlatMap buffer_sizes; + for (const auto& pair : allocation.assigned_buffers()) { + const LogicalBuffer* buffer = pair.first; + const BufferAllocation::OffsetSize& offset_size = pair.second; + id_to_buffer[buffer->id()] = buffer; + buffer_sizes[buffer] = offset_size.size; + } + + // Returns how much the given event increases the total size of live + // buffers. Can be negative. 
+ auto memory_delta = [&id_to_buffer, &buffer_sizes]( + const HeapSimulatorTrace::Event& event) -> int64 { + const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id()); + const int64 buffer_size = buffer_sizes.at(buffer); + if (event.kind() == HeapSimulatorTrace::Event::ALLOC) { + return buffer_size; + } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { + // Sharing a buffer does not change the live set size for the purposes of + // the heap simulator. Even though the shared-with buffer may be smaller, + // the entire allocation remains live. + return 0; + } else if (event.kind() == HeapSimulatorTrace::Event::FREE) { + return -1 * buffer_size; + } + LOG(FATAL) << "Unknown event kind: " << event.kind(); + }; + + // First compute the size of the maximal live set. + int64 max_live_size = 0; + int64 live_size = 0; + for (const auto& event : heap_trace.events()) { + live_size += memory_delta(event); + if (max_live_size < live_size) { + max_live_size = live_size; + } + } + + // Next gather the set of logical buffers live at the earliest point of + // maximal live set size. + tensorflow::gtl::FlatSet live_buffers; + live_size = 0; + for (const auto& event : heap_trace.events()) { + const LogicalBuffer* buffer = id_to_buffer.at(event.buffer_id()); + if (event.kind() == HeapSimulatorTrace::Event::ALLOC) { + InsertOrDie(&live_buffers, buffer); + } else if (event.kind() == HeapSimulatorTrace::Event::SHARE_WITH) { + // Nothing to do. + } else if (event.kind() == HeapSimulatorTrace::Event::FREE) { + CHECK(ContainsKey(live_buffers, buffer)); + live_buffers.erase(buffer); + } + + live_size += memory_delta(event); + if (live_size == max_live_size) { + break; + } + } + CHECK_EQ(live_size, max_live_size); + + std::vector live_buffers_vector; + live_buffers_vector.insert(live_buffers_vector.end(), live_buffers.begin(), + live_buffers.end()); + + // Stabily sort the live buffers. + std::sort(live_buffers_vector.begin(), live_buffers_vector.end(), + [](const LogicalBuffer* a, const LogicalBuffer* b) { + return a->id() < b->id(); + }); + return live_buffers_vector; +} + +} // namespace + void BufferAssigner::AssignBuffersFromHeapSimulator( const HeapSimulator::Result& result, BufferAssignment* assignment, LogicalBuffer::Color color) { @@ -1242,10 +1230,15 @@ void BufferAssigner::AssignBuffersFromHeapSimulator( BufferAllocation* allocation = assignment->NewEmptyAllocation( result.heap_size, /*is_thread_local=*/false, /*is_reusable=*/true, color); for (const auto& buffer_chunk : result.chunk_map) { - const LogicalBuffer& buffer = *buffer_chunk.first; + // TODO(lauj) Remove this down_cast after downstream users of + // BufferAllocation::assigned_buffers() are updated to use BufferValue. 
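The ComputePeakMemoryLogicalBuffers helper moved into the anonymous namespace above replays the heap-simulator trace twice: a first pass accumulates the per-event memory delta to find the maximal live size, and a second pass stops at the earliest point where that size is reached and records the buffers live there. A minimal sketch of that two-pass idea on a toy event list; the Event struct and PeakLiveSet name are mine, and SHARE_WITH events (delta 0) are omitted for brevity:

```cpp
#include <algorithm>
#include <cstdint>
#include <set>
#include <utility>
#include <vector>

struct Event {
  int64_t buffer_id;
  int64_t delta;  // +size on ALLOC, -size on FREE
};

// Returns the peak total live size and the ids live at the earliest peak point.
std::pair<int64_t, std::set<int64_t>> PeakLiveSet(const std::vector<Event>& trace) {
  // Pass 1: find the maximal total live size over the trace.
  int64_t live = 0, peak = 0;
  for (const Event& e : trace) {
    live += e.delta;
    peak = std::max(peak, live);
  }
  // Pass 2: replay until the earliest point where the peak is reached.
  std::set<int64_t> live_ids;
  live = 0;
  for (const Event& e : trace) {
    if (e.delta > 0) {
      live_ids.insert(e.buffer_id);
    } else {
      live_ids.erase(e.buffer_id);
    }
    live += e.delta;
    if (live == peak) break;
  }
  return {peak, live_ids};
}
```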
+ const LogicalBuffer& buffer = + *CHECK_NOTNULL(dynamic_cast(buffer_chunk.first)); const HeapSimulator::Chunk& chunk = buffer_chunk.second; assignment->AddAssignment(allocation, buffer, chunk.offset, chunk.size); } + allocation->peak_buffers_ = + ComputePeakMemoryLogicalBuffers(*allocation, result.debug_trace); VLOG(1) << "Ran heap simulation for allocation: " << allocation->ToString(); allocation->AddHeapTrace(result.debug_trace); diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h index 3086d0e2ca0026..ad0b0bf7c25d71 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment.h +++ b/tensorflow/compiler/xla/service/buffer_assignment.h @@ -206,17 +206,15 @@ class BufferAllocation { return heap_traces_; } - // Compute and return the LogicalBuffers which are live at the point of peak - // memory usage for the given allocation. The point of peak memory usage is - // the point at which the total size of all live logical buffers is - // maximal. If peak memory is reached at multiple points, the set of logical - // buffers live at the earliest maximal point is returned. The vector is - // stabily asserted by LogicalBuffer::Index. - // - // The return value is a pair of total size of the logical buffers at peak, - // and the buffers themselves. - std::pair> - ComputePeakMemoryLogicalBuffers() const; + // Returns the LogicalBuffers which are live at the point of peak memory usage + // for this allocation. The point of peak memory usage is the point at which + // the total size of all live logical buffers is maximal. If peak memory is + // reached at multiple points, the set of logical buffers live at the earliest + // maximal point is returned. The vector is stabily sorted by + // LogicalBuffer::Index. + const std::vector& PeakMemoryLogicalBuffers() const { + return peak_buffers_; + } // Get the number of bytes lost to fragmentation. This is equal to the // difference between the size of the allocation and the size of the maximal @@ -291,6 +289,9 @@ class BufferAllocation { int64 fragmentation_bytes_ = 0; std::vector heap_traces_; + + // Set of buffers live at the point of peak memory usage for this allocation. + std::vector peak_buffers_; }; // Add stream operators for nicer output of CHECK/RET_CHECK failures. @@ -414,10 +415,10 @@ class BufferAssignment { // Only BufferAssigner can build or modify BufferAssignments. friend class BufferAssigner; - explicit BufferAssignment(const HloModule* module, - std::unique_ptr liveness, - LogicalBuffer::SizeFunction buffer_size, - LogicalBuffer::AlignmentFunction color_alignment) + BufferAssignment(const HloModule* module, + std::unique_ptr liveness, + LogicalBuffer::SizeFunction buffer_size, + LogicalBuffer::AlignmentFunction color_alignment) : module_(module), liveness_(std::move(liveness)), buffer_size_(std::move(buffer_size)), diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc index 513a8785bbd52b..7e86c33687e595 100644 --- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc +++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc @@ -23,8 +23,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/call_graph.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/copy_insertion.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" @@ -32,12 +32,12 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_scheduling.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/macros.h" @@ -81,7 +81,7 @@ const std::vector GetInstructions(HloInstruction* root) { class BufferAssignmentTest : public HloTestBase { protected: - BufferAssignmentTest() : computation_tracker_() {} + BufferAssignmentTest() {} ~BufferAssignmentTest() override {} std::unique_ptr RunBufferAssignment(HloModule* module, @@ -251,9 +251,6 @@ class BufferAssignmentTest : public HloTestBase { return total_size; } - // Computation tracker for nested computations. - ComputationTracker computation_tracker_; - // Shapes for use in the examples. Shape s32_ = ShapeUtil::MakeShape(xla::S32, {}); Shape r0f32_ = ShapeUtil::MakeShape(xla::F32, {}); @@ -1519,12 +1516,8 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) { // single logical buffer should be exactly the logical buffer in that // allocation. const BufferAllocation& mul_buffer = GetTopLevelAllocation(*buffers, mul); - int64 peak_size; - std::vector peak_buffers; - - std::tie(peak_size, peak_buffers) = - mul_buffer.ComputePeakMemoryLogicalBuffers(); - EXPECT_EQ(peak_size, ShapeUtil::ByteSizeOf(f32vec100_)); + const std::vector& peak_buffers = + mul_buffer.PeakMemoryLogicalBuffers(); ASSERT_EQ(peak_buffers.size(), 1); EXPECT_EQ(peak_buffers[0]->instruction(), mul); } @@ -1555,6 +1548,7 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { HloInstruction::CreateConcatenate(concat_shape, {rev, neg}, 0)); // Make the root tiny so no interior nodes can share its buffer. auto root = builder.AddInstruction(HloInstruction::CreateSlice( + ShapeUtil::MakeShape(F32, {1}), concat, {0}, {1}, {1})); auto module = CreateNewModule(); @@ -1569,12 +1563,10 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { EXPECT_TRUE(buffer.IsPreallocatedTempBuffer()); ASSERT_EQ(buffer.assigned_buffers().size(), 4); - int64 peak_size; - std::vector peak_buffers; - std::tie(peak_size, peak_buffers) = buffer.ComputePeakMemoryLogicalBuffers(); + const std::vector& peak_buffers = + buffer.PeakMemoryLogicalBuffers(); // The peak live set should be concat and its inputs. 
- EXPECT_EQ(peak_size, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(F32, {400}))); ASSERT_EQ(peak_buffers.size(), 3); std::vector peak_instructions; for (const LogicalBuffer* logical_buffer : peak_buffers) { @@ -1583,6 +1575,69 @@ TEST_F(BufferAssignmentTest, PeakBuffers) { EXPECT_THAT(peak_instructions, UnorderedElementsAre(rev, neg, concat)); } +TEST_F(BufferAssignmentTest, PeakBuffersWhile) { + auto module = CreateNewModule(); + const Shape shape = ShapeUtil::MakeShape(F32, {123, 123}); + HloComputation* condition; + { + auto b = HloComputation::Builder(TestName() + ".cond"); + b.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); + b.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(true))); + condition = module->AddEmbeddedComputation(b.Build()); + } + HloComputation* body; + { + auto b = HloComputation::Builder(TestName() + ".body"); + auto param = + b.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); + b.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, param)); + body = module->AddEmbeddedComputation(b.Build()); + } + auto builder = HloComputation::Builder(TestName()); + auto param = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p0")); + auto copy = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopy, param)); + auto while_op = builder.AddInstruction( + HloInstruction::CreateWhile(shape, condition, body, copy)); + // This broadcast should get a temporary allocation which is merged with the + // allocation for the while. Peak buffers should include the while and the + // broadcast. + auto bcast = builder.AddInstruction(HloInstruction::CreateBroadcast( + ShapeUtil::MakeShape(F32, {123, 123, 123}), while_op, {0, 1})); + builder.AddInstruction(HloInstruction::CreateReverse( + ShapeUtil::MakeShape(F32, {123, 123, 123}), bcast, {0})); + module->AddEntryComputation(builder.Build()); + + auto buffers = RunBufferAssignment(module.get()); + const BufferAllocation& buffer = GetTopLevelAllocation(*buffers, bcast); + const std::vector& peak_buffers = + buffer.PeakMemoryLogicalBuffers(); + ASSERT_EQ(peak_buffers.size(), 2); + + // The peak buffers should include the broadcast and one of the colocated + // buffers of the while (body param, condition param, body root, or the while + // itself). 
+ const LogicalBuffer* bcast_buffer; + const LogicalBuffer* nonbcast_buffer; + if (peak_buffers[0]->instruction() == bcast) { + bcast_buffer = peak_buffers[0]; + nonbcast_buffer = peak_buffers[1]; + } else { + bcast_buffer = peak_buffers[1]; + nonbcast_buffer = peak_buffers[0]; + } + EXPECT_EQ(bcast_buffer->instruction(), bcast); + EXPECT_TRUE( + nonbcast_buffer->instruction() == copy || + nonbcast_buffer->instruction() == while_op || + nonbcast_buffer->instruction() == body->parameter_instruction(0) || + nonbcast_buffer->instruction() == body->root_instruction() || + nonbcast_buffer->instruction() == condition->parameter_instruction(0)); +} + class WhileBufferAssignmentTest : public HloTestBase { protected: std::unique_ptr BuildWhileConditionComputation( @@ -1626,7 +1681,7 @@ class WhileBufferAssignmentTest : public HloTestBase { .ConsumeValueOrDie(); } - static int64 ByteSizeOf(const LogicalBuffer& buffer) { + static int64 ByteSizeOf(const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), sizeof(void*)); } @@ -1641,7 +1696,7 @@ static void RunCopyInsertion(HloModule* module) { } TEST_F(WhileBufferAssignmentTest, TwoForwardWhileLoops) { - auto module = xla::MakeUnique(TestName()); + auto module = CreateNewModule(); auto builder = HloComputation::Builder("entry"); auto input0 = builder.AddInstruction( @@ -1738,7 +1793,7 @@ ENTRY %test_module { })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseHloString(module_str)); // Run CopyInsertion and check if the graph constructed above doesn't need // any copies inserted for BufferAssignment to run. @@ -1816,7 +1871,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { }; // Build the entry computation as described in the comment above. - auto module = xla::MakeUnique(TestName()); + auto module = CreateNewModule(); auto builder = HloComputation::Builder("entry"); auto infeed = builder.AddInstruction(HloInstruction::CreateInfeed(r0s32, "")); @@ -1884,7 +1939,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) { } TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) { - auto module = xla::MakeUnique(TestName()); + auto module = CreateNewModule(); auto builder = HloComputation::Builder("entry"); auto input0 = builder.AddInstruction( @@ -1929,7 +1984,7 @@ TEST_F(WhileBufferAssignmentTest, OneForwardBackwardWhileLoopSet) { } TEST_F(BufferAssignmentTest, TwoCalls) { - auto module = xla::MakeUnique(TestName()); + auto module = CreateNewModule(); Shape r0f32 = ShapeUtil::MakeShape(xla::F32, {}); HloComputation* sub_computation; { @@ -1994,7 +2049,7 @@ static bool IsPostOrderTraversal( } TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { - auto module = xla::MakeUnique(TestName()); + auto module = CreateNewModule(); auto builder = HloComputation::Builder(TestName()); auto zero = builder.AddInstruction( @@ -2073,7 +2128,7 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) { } TEST_F(WhileBufferAssignmentTest, WhilesDontShareEntryParamIfLiveOut) { - auto module = xla::MakeUnique(TestName()); + auto module = CreateNewModule(); auto builder = HloComputation::Builder("entry"); auto input0 = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/buffer_liveness.cc b/tensorflow/compiler/xla/service/buffer_liveness.cc index 37982aaef9eddd..810d597e730c18 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.cc +++ b/tensorflow/compiler/xla/service/buffer_liveness.cc @@ -21,7 +21,6 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/liveness_util.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -44,7 +43,7 @@ StatusOr> BufferLiveness::Run( return std::move(liveness); } -tensorflow::Status BufferLiveness::Analyze() { +Status BufferLiveness::Analyze() { TF_ASSIGN_OR_RETURN(points_to_analysis_, TuplePointsToAnalysis::Run(module_)); for (auto* computation : module_->computations()) { if (computation->IsFusionComputation()) { @@ -71,7 +70,7 @@ tensorflow::Status BufferLiveness::Analyze() { } XLA_VLOG_LINES(3, ToString()); - return tensorflow::Status::OK(); + return Status::OK(); } string BufferLiveness::ToString() const { @@ -105,8 +104,8 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a, for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) { // Every user of 'a' must be a predecessor of 'b' or 'b' itself. for (auto user : alias.instruction()->users()) { - if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), user, - points_to_analysis())) { + if (points_to_analysis().DoesNotUseOperandBuffer(alias.instruction(), + alias.index(), user)) { continue; } if (user != b.instruction() && @@ -132,9 +131,8 @@ bool BufferLiveness::live_range_strictly_before(const LogicalBuffer& a, // the qualifications specified in CanShareOperandBufferWithUser. for (const BufferAlias& alias : points_to_analysis_->GetBufferAliases(a)) { if (b.instruction()->IsUserOf(alias.instruction()) && - !CanShareOperandBufferWithUser(alias.instruction(), alias.index(), - b.instruction(), b.index(), - points_to_analysis())) { + !points_to_analysis().CanShareOperandBufferWithUser( + alias.instruction(), alias.index(), b.instruction(), b.index())) { return false; } } diff --git a/tensorflow/compiler/xla/service/buffer_liveness.h b/tensorflow/compiler/xla/service/buffer_liveness.h index 11834a5127e383..cdd3cf4032ef69 100644 --- a/tensorflow/compiler/xla/service/buffer_liveness.h +++ b/tensorflow/compiler/xla/service/buffer_liveness.h @@ -89,7 +89,7 @@ class BufferLiveness { // Perform buffer liveness analysis. This method must be called prior to // MayInterfere or MaybeLiveOut. - tensorflow::Status Analyze(); + Status Analyze(); // Returns true if the live range of the buffer of 'a' is strictly before the // live range of the buffer of 'b' (they do not overlap). diff --git a/tensorflow/compiler/xla/service/buffer_value.cc b/tensorflow/compiler/xla/service/buffer_value.cc new file mode 100644 index 00000000000000..2bc556a9e27013 --- /dev/null +++ b/tensorflow/compiler/xla/service/buffer_value.cc @@ -0,0 +1,68 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/buffer_value.h" + +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +BufferValue::BufferValue(HloInstruction* instruction, const ShapeIndex& index, + Id id) + : id_(id) { + const Shape& shape = ShapeUtil::GetSubshape(instruction->shape(), index); + is_array_ = ShapeUtil::IsArray(shape); + is_tuple_ = ShapeUtil::IsTuple(shape); +} + +BufferValue::~BufferValue() {} + +std::ostream& operator<<(std::ostream& out, const BufferValue& buffer) { + out << buffer.ToString(); + return out; +} + +/*static*/ LogicalBufferProto::Location BufferValue::ToLocationProto( + const HloInstruction& instruction, const ShapeIndex& index) { + LogicalBufferProto::Location proto; + proto.set_computation_name(instruction.parent()->name()); + proto.set_instruction_name(instruction.name()); + for (const int64 index_entry : index) { + proto.add_shape_index(index_entry); + } + return proto; +} + +LogicalBufferProto BufferValue::ToProto(const SizeFunction& size_fn) const { + LogicalBufferProto proto; + proto.set_id(id()); + proto.set_size(size_fn(*this)); + LogicalBufferProto::Location proto_location = + ToLocationProto(*instruction(), index()); + proto.mutable_defined_at()->Swap(&proto_location); + if (has_color()) { + proto.set_color(color().value()); + } + return proto; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/buffer_value.h b/tensorflow/compiler/xla/service/buffer_value.h new file mode 100644 index 00000000000000..f4be16e0843f64 --- /dev/null +++ b/tensorflow/compiler/xla/service/buffer_value.h @@ -0,0 +1,177 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_H_ + +#include +#include + +#include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/lib/gtl/int_type.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { + +// Abstract class describing a value used by one of the dataflow analyses - +// TuplePointsToAnalysis or HloDataflowAnalysis. +// TODO(b/78906445) Delete this class when TuplePointsToAnalysis is unused. +// +// XLA arrays are trivially a single BufferValue. 
Tuples are made up of more +// than one BufferValue: an BufferValue for the pointer vector, and an +// BufferValue for each child element. +// +// Every BufferValue is defined by a particular instruction and most +// instructions define only a single BufferValue. Instructions which define a +// single BufferValue include array-shaped instructions such as Add but also +// includes Tuple-shaped instructions such as Tuple. The Tuple instruction +// defines a single BufferValue which is a vector of pointers to the values +// containing the Tuple instruction's operands. Though the result of the Tuple +// instruction includes multiple values only the top-level BufferValue (the +// vector of pointers) is defined by the Tuple instruction. The values +// containing the tuple elements are defined by earlier instructions, usually +// the operands of the Tuple instruction. +// +// Instructions which construct both the tuple *and* the tuple elements define +// more than one BufferValue. This includes (at least) tuple-shaped Constant, +// Parameter, Infeed and While instructions. These tuple-shaped instructions do +// not assemble a tuple from existing BufferValues like the Tuple instruction +// does, but rather define all the BufferValues in the tuple. +// +// Some instructions, such as Bitcast, define no buffers. These instructions +// simply forward buffers from their operands. +// +// The BufferValue object describes which HLO instruction defines a buffer and +// where within that instruction's output shape the buffer is defined. The +// location within the output shape is indicated by BufferValue::index() which +// is defined identically to the index used in ShapeUtil::GetSubshape(). +// Examples: +// +// %add = Add(%foo, %bar) +// %tuple_constant = Constant({1, {42, 43}}) +// +// %add defines a single array-shaped buffer BufferValue(%add, {}) which holds +// the array result of the add operation. The nested-tuple-shaped +// %tuple_constant defines 5 buffers described by the following BufferValue +// objects: +// +// BufferValue(%tuple_constant, {}) // "Top-level" buffer: vector of +// // pointers to BufferValues at +// // indices {0} and {1} +// BufferValue(%tuple_constant, {0}) // Holds value "1" +// BufferValue(%tuple_constant, {1}) // Holds nested tuple: vector of +// // pointers to BufferValues at +// // indices {1, 0} and {1, 1} +// BufferValue(%tuple_constant, {1, 0}) // Holds value "42" +// BufferValue(%tuple_constant, {1, 1}) // Holds value "43" + +class BufferValue { + public: + TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64); + + // Id is a unique identifier for the BufferValue to facilitate efficient + // collections of BufferValues with stable iteration order. + using Id = int64; + + // Functions which return the size and alignment of a logical buffer in bytes. + using SizeFunction = std::function; + using AlignmentFunction = std::function; + + virtual ~BufferValue(); + + Id id() const { return id_; } + + // Return the instruction that defines the buffer. + virtual HloInstruction* instruction() const = 0; + + // Return the index within the output of the instruction where the buffer is + // defined. Index used defined as in ShapeUtil::GetSubshape() + virtual const ShapeIndex& index() const = 0; + + // Return the color of the BufferValue. Differently colored buffers can not be + // parts of the same allocation. 
+ Color color() const { + CHECK_NE(color_, kInvalidColor) + << "Should not query the color of a buffer that was never colored"; + return color_; + } + + void set_color(Color color) { + CHECK_NE(color, kInvalidColor) + << "Should not set the color of a buffer to the invalid color"; + color_ = color; + } + + bool has_color() const { return color_ != kInvalidColor; } + + // Return the shape of the buffer. This reference points into the shape field + // of the instruction defining the buffer. Therefore, the returned shape will + // contain the layout of instruction, if any. + virtual const Shape& shape() const = 0; + + // Returns true if this buffer is the top-level output buffer of the defining + // HLO instruction. This is equivalent to index == {}. + bool IsTopLevel() const { return index().empty(); } + + // Whether this buffer contains a tuple. + bool IsTuple() const { return is_tuple_; } + + // Whether this buffer contains an array. + bool IsArray() const { return is_array_; } + + // operator< is required for std::set. + bool operator<(const BufferValue& other) const { return id_ < other.id_; } + + virtual string ToString() const = 0; + + // TODO(lauj) rename LogicalBufferProto to BufferValueProto. + LogicalBufferProto ToProto(const SizeFunction& size_fn) const; + + // Returns the LogicalBufferProto::Location that serializes the given + // instruction and index. + static LogicalBufferProto::Location ToLocationProto( + const HloInstruction& instruction, const ShapeIndex& index); + + const Color kInvalidColor = Color(-1); + + protected: + BufferValue(HloInstruction* instruction, const ShapeIndex& index, Id id); + + private: + // The definining instruction and index are not stored here; they can be found + // in the LogicalBuffer and HloValue subclasses. This class exists only to + // support migrations from TuplePointsToAnalysis to HloDataflowAnalysis, by + // allowing abstract use of LogicalBuffer or HloValue. After those migrations + // are complete, this class should be deleted (b/78906445). Because we plan to + // delete LogicalBuffer and this class, we don't refactor all the shared + // features from LogicalBuffer and HloValue into this class. + Id id_ : 62; + bool is_array_ : 1; + bool is_tuple_ : 1; + Color color_ = kInvalidColor; +}; + +std::ostream& operator<<(std::ostream& out, const BufferValue& buffer); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_H_ diff --git a/tensorflow/compiler/xla/service/buffer_value_containers.h b/tensorflow/compiler/xla/service/buffer_value_containers.h new file mode 100644 index 00000000000000..305914fca828f1 --- /dev/null +++ b/tensorflow/compiler/xla/service/buffer_value_containers.h @@ -0,0 +1,55 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_ + +#include "tensorflow/compiler/xla/service/buffer_value.h" +#include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/core/lib/gtl/compactptrset.h" +#include "tensorflow/core/lib/gtl/flatset.h" + +namespace xla { + +// Define various containers of BufferValues, and utilities to convert from +// containers of LogicalBuffers to containers of BufferValues. + +using BufferValueCompactPointerSet = + tensorflow::gtl::CompactPointerSet; +template +BufferValueCompactPointerSet ToBufferValueCompactPointerSet( + const LogicalBufferContainerT& logical_buffer_container) { + BufferValueCompactPointerSet output; + for (const LogicalBuffer* buffer : logical_buffer_container) { + output.insert(buffer); + } + return output; +} + +using BufferValueFlatSet = tensorflow::gtl::FlatSet; +template +BufferValueFlatSet ToBufferValueFlatSet( + const LogicalBufferContainerT& logical_buffer_container) { + BufferValueFlatSet output; + output.reserve(logical_buffer_container.size()); + for (const LogicalBuffer* buffer : logical_buffer_container) { + output.insert(buffer); + } + return output; +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_BUFFER_VALUE_CONTAINERS_H_ diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc index a8053d15e12431..a23427f00ccd88 100644 --- a/tensorflow/compiler/xla/service/call_graph.cc +++ b/tensorflow/compiler/xla/service/call_graph.cc @@ -57,6 +57,7 @@ CallContext GetInstructionCallContext(HloOpcode opcode) { case HloOpcode::kConditional: case HloOpcode::kWhile: return CallContext::kSequential; + case HloOpcode::kCrossReplicaSum: case HloOpcode::kMap: case HloOpcode::kReduce: case HloOpcode::kReduceWindow: diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h index c7763f2ca3e684..52f33a1318e91d 100644 --- a/tensorflow/compiler/xla/service/channel_tracker.h +++ b/tensorflow/compiler/xla/service/channel_tracker.h @@ -19,8 +19,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/user_computation.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/statusor.h" diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc index c9f78a0f9f1c0e..d8fdccf9bbf1c1 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.cc +++ b/tensorflow/compiler/xla/service/compile_only_service.cc @@ -22,7 +22,6 @@ limitations under the License. 
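Note: taken together, the new `BufferValue` base class and the container helpers above let a pass be written against one value type regardless of which dataflow analysis produced the values. A small sketch under that assumption (the helpers `SizeOfValue` and `TotalSize` are illustrative; it relies on `LogicalBuffer` deriving from `BufferValue`, as introduced in this change):

```c++
// Sketch: a size function written against BufferValue, applied to a
// container of LogicalBuffers via the new conversion helper.
#include <vector>

#include "tensorflow/compiler/xla/service/buffer_value.h"
#include "tensorflow/compiler/xla/service/buffer_value_containers.h"
#include "tensorflow/compiler/xla/service/logical_buffer.h"
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/compiler/xla/types.h"

namespace xla {

// Works for values produced by either TuplePointsToAnalysis (LogicalBuffer)
// or HloDataflowAnalysis (HloValue), since both derive from BufferValue.
int64 SizeOfValue(const BufferValue& value) {
  return ShapeUtil::ByteSizeOf(value.shape(), sizeof(void*));
}

int64 TotalSize(const std::vector<const LogicalBuffer*>& logical_buffers) {
  // Widen to BufferValue once, then use only the analysis-agnostic interface.
  BufferValueFlatSet values = ToBufferValueFlatSet(logical_buffers);
  int64 total = 0;
  for (const BufferValue* value : values) {
    total += SizeOfValue(*value);
  }
  return total;
}

}  // namespace xla
```

This mirrors the migration path named in the header comment: code can move from `LogicalBuffer` to `HloValue` without touching helpers that only need the shared `BufferValue` interface.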
#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/computation_layout.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" @@ -69,70 +68,34 @@ CompileOnlyService::CompileAheadOfTime( for (const AotXlaComputationInstance& instance : computations) { TF_RET_CHECK(instance.computation.has_program_shape()); - const DebugOptions& debug_options = options.debug_options(); - const auto& program_shape = instance.computation.program_shape(); - ExecutionOptions execution_options; - *execution_options.mutable_debug_options() = debug_options; - TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(program_shape, instance.argument_layouts, - &execution_options)); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_module, - HloModule::CreateFromProto(instance.computation, *module_config)); - TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module)); - hlo_modules.push_back(std::move(hlo_module)); - } - - return compiler_->CompileAheadOfTime(std::move(hlo_modules), options); -} - -StatusOr>> -CompileOnlyService::CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& options) { - std::vector> hlo_modules; - for (const AotComputationInstance& instance : computations) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(instance.computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - const DebugOptions& debug_options = options.debug_options(); - // Dump computation proto state if flag is set. + // Dump computation proto if flag is set. 
const string& directory_path = debug_options.xla_dump_computations_to(); if (!directory_path.empty()) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr session_module, - computation_tracker_.SnapshotComputation(versioned_handle.handle)); + HloSnapshot hlo_snapshot; + *hlo_snapshot.mutable_hlo()->mutable_hlo_module() = instance.computation; string filename = tensorflow::strings::StrCat( - "computation_", versioned_handle.handle.handle(), "__", - session_module->entry().name(), "__version_", - versioned_handle.version); + "computation_", instance.computation.id(), "__", + instance.computation.entry_computation_name()); const string& per_host_path = tensorflow::io::JoinPath( directory_path, tensorflow::port::Hostname()); - TF_RETURN_IF_ERROR(Executable::DumpToDirectory(per_host_path, filename, - *session_module)); + TF_RETURN_IF_ERROR( + Executable::DumpToDirectory(per_host_path, filename, hlo_snapshot)); } - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - + const auto& program_shape = instance.computation.program_shape(); ExecutionOptions execution_options; *execution_options.mutable_debug_options() = debug_options; TF_ASSIGN_OR_RETURN( std::unique_ptr module_config, - CreateModuleConfig(*program_shape, instance.argument_layouts, - &execution_options, user_computation)); + CreateModuleConfig(program_shape, instance.argument_layouts, + &execution_options)); - TF_ASSIGN_OR_RETURN(std::unique_ptr hlo_module, - computation_tracker_.BuildHloModule( - versioned_handle, *module_config, - /*include_unreachable_instructions=*/true)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_module, + HloModule::CreateFromProto(instance.computation, *module_config)); TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module)); hlo_modules.push_back(std::move(hlo_module)); } diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h index c10609e67fcdec..e6a66c202d6e0d 100644 --- a/tensorflow/compiler/xla/service/compile_only_service.h +++ b/tensorflow/compiler/xla/service/compile_only_service.h @@ -38,24 +38,7 @@ class CompileOnlyService : public Service { static StatusOr> NewService( const ServiceOptions& options); - // A description of a computation to compile using CompileAheadOfTime. - struct AotComputationInstance { - ComputationHandle computation; - std::vector argument_layouts; - const Shape* result_layout = nullptr; - }; - - // Compiles a list of computations for ahead-of-time execution. This is - // intended for use in static compilation. See - // |CompileOnlyClient::CompileAheadOfTime| for additional details. - StatusOr>> - CompileAheadOfTime( - const tensorflow::gtl::ArraySlice computations, - const AotCompilationOptions& Options); - // A description of a xla computation to compile using CompileAheadOfTime. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. struct AotXlaComputationInstance { HloModuleProto computation; std::vector argument_layouts; @@ -65,58 +48,36 @@ class CompileOnlyService : public Service { // Compiles a list of xla computations for ahead-of-time execution. This is // intended for use in static compilation. See // |CompileOnlyClient::CompileAheadOfTime| for additional details. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. 
StatusOr>> CompileAheadOfTime( const tensorflow::gtl::ArraySlice computations, const AotCompilationOptions& options); - // Override Service methods that require or imply the existence of an - // execute backend. Note that this does not include TransferToClient, as - // computing constants produces global data that we may wish to transfer. - tensorflow::Status Execute(const ExecuteRequest* arg, - ExecuteResponse* result) override { - return Unimplemented("CompileOnlyService does not support execution."); - } - tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) override { - return Unimplemented("CompileOnlyService does not support execution."); - } - tensorflow::Status GetDeviceHandles( - const GetDeviceHandlesRequest* arg, - GetDeviceHandlesResponse* result) override { + Status GetDeviceHandles(const GetDeviceHandlesRequest* arg, + GetDeviceHandlesResponse* result) override { return Unimplemented("CompileOnlyService does not support devices."); } - tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override { - return Unimplemented("CompileOnlyService does not support execution."); - } - tensorflow::Status WaitForExecution( - const WaitForExecutionRequest* arg, - WaitForExecutionResponse* result) override { + Status WaitForExecution(const WaitForExecutionRequest* arg, + WaitForExecutionResponse* result) override { return Unimplemented("CompileOnlyService does not support execution."); } - tensorflow::Status TransferToServer( - const TransferToServerRequest* arg, - TransferToServerResponse* result) override { + Status TransferToServer(const TransferToServerRequest* arg, + TransferToServerResponse* result) override { return Unimplemented( "CompileOnlyService does not support device data transfers."); } - tensorflow::Status TransferToInfeed( - const TransferToInfeedRequest* arg, - TransferToInfeedResponse* result) override { + Status TransferToInfeed(const TransferToInfeedRequest* arg, + TransferToInfeedResponse* result) override { return Unimplemented( "CompileOnlyService does not support device data transfers."); } - tensorflow::Status TransferFromOutfeed( - const TransferFromOutfeedRequest* arg, - TransferFromOutfeedResponse* result) override { + Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg, + TransferFromOutfeedResponse* result) override { return Unimplemented( "CompileOnlyService does not support device data transfers."); } - tensorflow::Status ResetDevice(const ResetDeviceRequest* arg, - ResetDeviceResponse* result) override { + Status ResetDevice(const ResetDeviceRequest* arg, + ResetDeviceResponse* result) override { return Unimplemented("CompileOnlyService does not support devices."); } diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc index 8b01a6c4b5004d..6f06bba6798bdf 100644 --- a/tensorflow/compiler/xla/service/compiler.cc +++ b/tensorflow/compiler/xla/service/compiler.cc @@ -28,6 +28,13 @@ namespace xla { /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_( tensorflow::LINKER_INITIALIZED); +std::vector> +Compiler::ComputeBackendConfigs(const HloInstruction& hlo, + se::StreamExecutor* executor) const { + CHECK(executor != nullptr); + return {}; +} + /* static */ std::map* Compiler::GetPlatformCompilerFactories() { static auto* r = new std::map; diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h index 5c14591d93cc99..6c52ffd800d19d 100644 --- 
a/tensorflow/compiler/xla/service/compiler.h +++ b/tensorflow/compiler/xla/service/compiler.h @@ -24,8 +24,11 @@ limitations under the License. #include #include #include +#include +#include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/executable.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" @@ -33,6 +36,7 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/thread_annotations.h" @@ -152,6 +156,16 @@ class Compiler { std::vector> stream_exec, DeviceMemoryAllocator* device_allocator) = 0; + // Returns the backend configurations that the backend will consider for the + // given HLO. Returns no configurations if the backend does not support + // configurations for the given HLO. + // + // The stream executor is passed in to provide information about the hardware + // that the backend configurations would be targeting. + virtual std::vector> + ComputeBackendConfigs(const HloInstruction& hlo, + se::StreamExecutor* executor) const; + // Compiles the HLO module for ahead-of-time execution. This is intended for // use in static compilation. virtual StatusOr>> @@ -181,9 +195,9 @@ class Compiler { // Returns a function that computes the size in bytes of a given // logical buffer. - std::function BufferSizeBytesFunction() { + std::function BufferSizeBytesFunction() { HloCostAnalysis::ShapeSizeFunction shape_size = ShapeSizeBytesFunction(); - return [shape_size](const LogicalBuffer& buffer) { + return [shape_size](const BufferValue& buffer) { return shape_size(buffer.shape()); }; } diff --git a/tensorflow/compiler/xla/service/computation_layout.cc b/tensorflow/compiler/xla/service/computation_layout.cc index d2d4f14fcec35f..cb61f3da39fb8e 100644 --- a/tensorflow/compiler/xla/service/computation_layout.cc +++ b/tensorflow/compiler/xla/service/computation_layout.cc @@ -23,12 +23,15 @@ limitations under the License. namespace xla { -ComputationLayout::ComputationLayout(const ProgramShape& program_shape) +ComputationLayout::ComputationLayout(const ProgramShape& program_shape, + bool ignore_layouts) : result_layout_(program_shape.result()) { for (auto& shape : program_shape.parameters()) { parameter_layouts_.emplace_back(shape); } - SetToDefaultLayout(); + if (ignore_layouts) { + SetToDefaultLayout(); + } } void ComputationLayout::SetToDefaultLayout() { diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h index 80e102411c7885..53c3a3f7b73868 100644 --- a/tensorflow/compiler/xla/service/computation_layout.h +++ b/tensorflow/compiler/xla/service/computation_layout.h @@ -34,8 +34,9 @@ class ComputationLayout { public: // Constructs a ComputationLayout from a ProgramShape. The layouts of the // parameters and results are set to the default layout. Layouts in the - // ProgramShape are ignored. - explicit ComputationLayout(const ProgramShape& program_shape); + // ProgramShape are ignored if ignore_layouts is true. 
+ explicit ComputationLayout(const ProgramShape& program_shape, + bool ignore_layouts = true); // Returns the layout of a particular parameter. const ShapeLayout& parameter_layout(int64 param_no) const { diff --git a/tensorflow/compiler/xla/service/computation_tracker.cc b/tensorflow/compiler/xla/service/computation_tracker.cc deleted file mode 100644 index 70e25eebdb068d..00000000000000 --- a/tensorflow/compiler/xla/service/computation_tracker.cc +++ /dev/null @@ -1,256 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/computation_tracker.h" - -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/logging.h" - -using ::tensorflow::strings::Appendf; - -namespace xla { - -ComputationTracker::ComputationTracker() : next_computation_(1) {} - -ComputationHandle ComputationTracker::NewComputation( - const string& computation_name) { - tensorflow::mutex_lock lock(computation_mutex_); - ComputationHandle computation_handle; - int64 handle_value = next_computation_++; - computation_handle.set_handle(handle_value); - opaque_to_computation_[handle_value] = - MakeUnique(computation_name, computation_handle); - return computation_handle; -} - -StatusOr ComputationTracker::LoadSessionModule( - const SessionModule& session_module) { - tensorflow::mutex_lock lock(computation_mutex_); - - // For each embedded computation, create a new computation based on its - // serialized data, and place the mapping from the old computation handle to - // the new computation handle. - - // Build a mapping from old embedded computation handles to new computation - // handles. We build the ID mapping first since the embedded computations are - // in no particular order and may refer to each other. - std::map old_to_new; - for (const SessionComputation& computation : - session_module.embedded_computations()) { - const int64 old_handle = computation.computation_handle().handle(); - if (!old_to_new.emplace(old_handle, AllocateHandle()).second) { - return InvalidArgument("Duplicate embedded computation handle %lld", - old_handle); - } - } - - // Create a new computation from each serialized embedded computation. 
- for (const SessionComputation& computation : - session_module.embedded_computations()) { - const int64 old_handle = computation.computation_handle().handle(); - const ComputationHandle& new_handle = old_to_new[old_handle]; - TF_ASSIGN_OR_RETURN(opaque_to_computation_[new_handle.handle()], - UserComputation::MakeWithRemapping( - computation, new_handle, old_to_new)); - } - - // Finally, place the entry computation in the tracker with all of the - // remappings populated from the above. - const int64 old_handle = session_module.entry().computation_handle().handle(); - TF_ASSIGN_OR_RETURN( - old_to_new[old_handle], - LoadSessionComputation(session_module.entry(), &old_to_new)); - return old_to_new[old_handle]; -} - -StatusOr> -ComputationTracker::SnapshotComputation(const ComputationHandle& computation) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, Resolve(computation)); - const VersionedComputationHandle entry_versioned_handle = - user_computation->GetVersionedHandle(); - std::set visited; - std::list post_order; - { - tensorflow::mutex_lock lock(computation_mutex_); - ComputeComputationPostOrder(entry_versioned_handle, &visited, &post_order); - } - auto session_module = MakeUnique(); - *session_module->mutable_entry() = - Resolve(entry_versioned_handle.handle) - .ValueOrDie() - ->CloneSessionComputation(entry_versioned_handle.version); - for (auto it = ++post_order.rbegin(); it != post_order.rend(); ++it) { - *session_module->add_embedded_computations() = - Resolve(it->handle).ValueOrDie()->CloneSessionComputation(it->version); - } - return std::move(session_module); -} - -StatusOr ComputationTracker::Resolve( - const ComputationHandle& computation) const { - tensorflow::mutex_lock lock(computation_mutex_); - return ResolveInternal(computation); -} - -ComputationHandle ComputationTracker::AllocateHandle() { - int64 handle_value = next_computation_++; - ComputationHandle result; - result.set_handle(handle_value); - return result; -} - -StatusOr ComputationTracker::LoadSessionComputation( - const SessionComputation& session_computation, - std::map* old_to_new) { - TF_RET_CHECK(old_to_new != nullptr); - const ComputationHandle new_handle = AllocateHandle(); - (*old_to_new)[session_computation.computation_handle().handle()] = new_handle; - TF_ASSIGN_OR_RETURN(opaque_to_computation_[new_handle.handle()], - UserComputation::MakeWithRemapping( - session_computation, new_handle, *old_to_new)); - return new_handle; -} - -StatusOr ComputationTracker::ResolveInternal( - const ComputationHandle& computation) const { - auto it = opaque_to_computation_.find(computation.handle()); - if (it == opaque_to_computation_.end()) { - return NotFound("computation handle not found: %lld", computation.handle()); - } - UserComputation* user_computation = it->second.get(); - return user_computation; -} - -void ComputationTracker::ComputeComputationPostOrder( - const VersionedComputationHandle& versioned_handle, - std::set* visited, - std::list* post_order) const { - if (visited->count(versioned_handle) > 0) { - CHECK_EQ(1, visited->count(versioned_handle)); - return; - } - - UserComputation* computation = - ResolveInternal(versioned_handle.handle).ValueOrDie(); - std::vector embedded_handles = - computation->GetEmbeddedComputations(versioned_handle.version); - - for (const auto& embedded_handle : embedded_handles) { - ComputeComputationPostOrder(embedded_handle, visited, post_order); - } - - visited->insert(versioned_handle); - post_order->push_back(versioned_handle); -} - -StatusOr> 
ComputationTracker::BuildHloModule( - const VersionedComputationHandle& entry_handle, - const HloModuleConfig& config, - bool include_unreachable_instructions) const { - tensorflow::mutex_lock lock(computation_mutex_); - - VLOG(1) << "BuildHloModule(" << entry_handle - << ", include_unreachable_instructions=" - << include_unreachable_instructions << ")"; - XLA_VLOG_LINES(1, ToStringInternal()); - - TF_ASSIGN_OR_RETURN(UserComputation * entry_computation, - ResolveInternal(entry_handle.handle)); - - // Build a topological sort of the entry and any embedded computations as a - // list. The root of the computation will be the last element in the list. - std::set visited; - std::list post_order; - ComputeComputationPostOrder(entry_handle, &visited, &post_order); - - // Map from ComputationHandle value and computation version to HloComputation. - std::map hlo_computations; - - // The resolver lambda resolves VersionedHandles to embedded - // HloComputation*. This is required by UserComputation::BuildHloComputation - // when lowering calling operations (map, reduce etc). - auto resolver = [&hlo_computations]( - const VersionedComputationHandle& versioned_handle) -> HloComputation* { - CHECK_GT(hlo_computations.count(versioned_handle), 0); - return hlo_computations.at(versioned_handle); - }; - - // Print the post-order list for this entry computation. - if (VLOG_IS_ON(2)) { - VLOG(2) << "Visiting UserComputations in post order:"; - for (const VersionedComputationHandle& versioned_handle : post_order) { - VLOG(2) << " " << versioned_handle; - } - } - - string module_name = - tensorflow::strings::StrCat(entry_computation->name(), "_module"); - auto module = MakeUnique(module_name, entry_handle, config); - for (auto versioned_handle : post_order) { - UserComputation* computation = - ResolveInternal(versioned_handle.handle).ValueOrDie(); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_computation, - computation->BuildHloComputation(versioned_handle.version, resolver, - config.debug_options(), - include_unreachable_instructions)); - - // Add the newly created computation to VersionedHandle-to-HloComputation - // map. - DCHECK_EQ(0, hlo_computations.count(versioned_handle)); - hlo_computations[versioned_handle] = hlo_computation.get(); - - if (computation == entry_computation) { - module->AddEntryComputation(std::move(hlo_computation)); - } else { - module->AddEmbeddedComputation(std::move(hlo_computation)); - } - } - - return std::move(module); -} - -string ComputationTracker::ToString() const { - tensorflow::mutex_lock lock(computation_mutex_); - return ToStringInternal(); -} - -string ComputationTracker::ToStringInternal() const { - string out; - Appendf(&out, "ComputationTracker(%p):\n", this); - for (const auto& handle_computation : opaque_to_computation_) { - int64 handle = handle_computation.first; - const std::unique_ptr& computation = - handle_computation.second; - Appendf(&out, " %4lld : %s \"%s\"\n", handle, - computation->GetVersionedHandle().ToString().c_str(), - computation->name().c_str()); - } - return out; -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/computation_tracker.h b/tensorflow/compiler/xla/service/computation_tracker.h deleted file mode 100644 index d42d66adefe7fa..00000000000000 --- a/tensorflow/compiler/xla/service/computation_tracker.h +++ /dev/null @@ -1,147 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/user_computation.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { - -// Tracks computations for the XLA service; computations can be registered -// with a UserComputation instance and can be resolved from a handle for later -// use. -// -// This class is also capable of serializing/deserializing computations that it -// tracks (and to serialize properly you need to serialize all referred-to -// computations as well). -class ComputationTracker { - public: - ComputationTracker(); - - // Creates a new UserComputation object and returns the corresponding - // ComputationHandle for it. - // - // Precondition: user_computation is not already present in the map. - ComputationHandle NewComputation(const string& computation_name); - - // Restores session data for a computation that has been serialized, and - // allocates a new computation handle for it. - StatusOr LoadSessionModule( - const SessionModule& session_module); - - // Snapshots a computation (referenced by the provided handle) at its latest - // version, returning a module where it is the entry, and any referred-to - // computations are entrained as "embedded" (non-entry) computations. - StatusOr> SnapshotComputation( - const ComputationHandle& computation); - - // Resolves a ComputationHandle to a UserComputation that is present in the - // map. - StatusOr Resolve( - const ComputationHandle& computation) const; - - // Builds an HLO module using the specified computation as the entry. The - // module will include the entry computation as well as all computations which - // are called directly or indirectly from the entry computation via operations - // like "map". config is the HLO module configuration to use for the - // constructed module. - // If include_unreachable_instructions is true, then instructions - // which are not reachable from the root are lowered into HloInstructions - // including unreachable parameters. This ensures the entry HloComputation has - // the same program shape (ProgramShape) as the entry UserComputation. 
- StatusOr> BuildHloModule( - const VersionedComputationHandle& entry_handle, - const HloModuleConfig& config, - bool include_unreachable_instructions = true) const; - - string ToString() const; - - private: - // Bumps the next_computation_ number and returns the allocated number wrapped - // in a ComputationHandle. - ComputationHandle AllocateHandle() - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Loads a session computation into a UserComputation, registers it, and - // returns the computation handle of the registered computation. If old_to_new - // is provided, it is used for remapping references to computations present in - // session_computation. - // - // old_to_new will be updated with the mapping from session_computation's old - // handle to the returned handle value, and may not be null. - StatusOr LoadSessionComputation( - const SessionComputation& session_computation, - std::map* old_to_new) - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Internal implementation of Resolve method which requires, but does not - // acquire the mutex. - StatusOr ResolveInternal( - const ComputationHandle& computation) const - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Builds a post order sort of a computation ("entry") and all of its embedded - // computations including all transitively embedded computations. An embedded - // computation (the callee) will always appear in the sort before the - // computation which calls the embedded computation (the caller). Necessarily, - // the entry computation is the last element in the sort. visited and - // post_order should be empty when calling. post_order contains the post order - // sort when the function return. - void ComputeComputationPostOrder( - const VersionedComputationHandle& versioned_handle, - std::set* visited, - std::list* post_order) const - EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - string ToStringInternal() const EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_); - - // Guards the computation mapping. Marked mutable so that the Resolve method - // can remain const; Resolve does't really modify the tracker in any way, but - // it has to lock the mutex for safety. - mutable tensorflow::mutex computation_mutex_; - - // The next sequence number to assign to a computation, guarded by the same - // mutex as the mapping as they'll be mutated at the same time. - int64 next_computation_ GUARDED_BY(computation_mutex_); - - // Mapping from ComputationHandle value to the corresponding registered - // UserComputation object. - std::map> opaque_to_computation_ - GUARDED_BY(computation_mutex_); - - TF_DISALLOW_COPY_AND_ASSIGN(ComputationTracker); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_ diff --git a/tensorflow/compiler/xla/service/conditional_simplifier.cc b/tensorflow/compiler/xla/service/conditional_simplifier.cc index e560abc87f8456..e9ec796121fff2 100644 --- a/tensorflow/compiler/xla/service/conditional_simplifier.cc +++ b/tensorflow/compiler/xla/service/conditional_simplifier.cc @@ -35,7 +35,7 @@ namespace xla { // Tries to replace a conditional with a call operation of the corresponding // computation. If the given conditional has a constant predicate, tries to -// replace it with a call to its true/false computation as appropirate and then +// replace it with a call to its true/false computation as appropriate and then // inline that computation. // // Returns true if it made a change to the graph. 
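Note: the rewrite described in the comment above amounts to the following simplified sketch (the function name and error handling are illustrative; the actual pass additionally inlines the resulting call and handles more cases):

```c++
// Sketch: replace a conditional whose predicate is a constant with a call to
// the branch that would execute.
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_instruction.h"
#include "tensorflow/compiler/xla/service/hlo_opcode.h"
#include "tensorflow/compiler/xla/status.h"
#include "tensorflow/compiler/xla/statusor.h"

namespace xla {

StatusOr<bool> TryRewriteConstantConditional(HloInstruction* conditional) {
  if (conditional->opcode() != HloOpcode::kConditional ||
      conditional->operand(0)->opcode() != HloOpcode::kConstant) {
    return false;
  }
  const bool take_true_branch =
      conditional->operand(0)->literal().Get<bool>({});
  HloComputation* branch = take_true_branch ? conditional->true_computation()
                                            : conditional->false_computation();
  // Operand 1 feeds the true computation, operand 2 the false computation.
  HloInstruction* branch_arg =
      conditional->mutable_operand(take_true_branch ? 1 : 2);
  Status replaced = conditional->parent()->ReplaceWithNewInstruction(
      conditional,
      HloInstruction::CreateCall(conditional->shape(), {branch_arg}, branch));
  if (!replaced.ok()) {
    return replaced;
  }
  // The real pass then inlines the resulting kCall into the caller.
  return true;
}

}  // namespace xla
```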
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc index 40519ecc799c8f..33d8338809d4e8 100644 --- a/tensorflow/compiler/xla/service/copy_insertion.cc +++ b/tensorflow/compiler/xla/service/copy_insertion.cc @@ -23,7 +23,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" -#include "tensorflow/compiler/xla/service/liveness_util.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -65,7 +64,7 @@ struct SpecialCaseCopyPolicy { // output tuple. bool copy_root_replicated_buffers = false; // If true, insert a copy if a buffer coming from a constant or a parameter - // is found wihtin the output tuple. + // is found within the output tuple. bool copy_parameters_and_constants = false; }; diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD index 04fda3b2df5745..278bb1bebfa1a0 100644 --- a/tensorflow/compiler/xla/service/cpu/BUILD +++ b/tensorflow/compiler/xla/service/cpu/BUILD @@ -103,6 +103,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:algebraic_simplifier", + "//tensorflow/compiler/xla/service:batch_dot_simplification", "//tensorflow/compiler/xla/service:batchnorm_expander", "//tensorflow/compiler/xla/service:buffer_assignment", "//tensorflow/compiler/xla/service:buffer_liveness", @@ -125,12 +126,14 @@ cc_library( "//tensorflow/compiler/xla/service:hlo_scheduling", "//tensorflow/compiler/xla/service:hlo_subcomputation_unification", "//tensorflow/compiler/xla/service:hlo_verifier", + "//tensorflow/compiler/xla/service:indexed_array_analysis", "//tensorflow/compiler/xla/service:inliner", "//tensorflow/compiler/xla/service:llvm_compiler", "//tensorflow/compiler/xla/service:reduce_precision_insertion", "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/compiler/xla/service:while_loop_constant_sinking", "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion", "//tensorflow/compiler/xla/service:while_loop_simplifier", "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination", @@ -169,11 +172,13 @@ cc_library( ":orc_jit_memory_mapper", ":runtime_fp16", ":runtime_conv2d", + ":runtime_conv2d_mkl", ":runtime_fft", ":runtime_fork_join", ":runtime_matmul", ":runtime_matmul_mkl", ":runtime_single_threaded_conv2d", + ":runtime_single_threaded_fft", ":runtime_single_threaded_matmul", "@llvm//:execution_engine", "@llvm//:core", @@ -293,6 +298,15 @@ cc_library( ], ) +cc_library( + name = "target_machine_features_fake", + testonly = 1, + hdrs = ["target_machine_features_fake.h"], + deps = [ + ":target_machine_features", + ], +) + cc_library( name = "ir_function", srcs = ["ir_function.cc"], @@ -334,6 +348,7 @@ cc_library( deps = [ ":cpu_options", ":cpu_runtime", + ":ir_emission_utils", ":target_machine_features", ":vector_support_library", "//tensorflow/compiler/xla:shape_util", @@ -363,10 +378,10 @@ tf_cc_binary( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - 
"//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/core:lib", ], ) @@ -406,7 +421,6 @@ cc_library( "//tensorflow/core:lib", "@llvm//:analysis", "@llvm//:core", - "@llvm//:execution_engine", "@llvm//:ipo", "@llvm//:mc", "@llvm//:object", @@ -470,6 +484,27 @@ cc_library( ], ) +cc_library( + name = "runtime_conv2d_mkl", + srcs = [ + "runtime_conv2d_mkl.cc", + ], + hdrs = ["runtime_conv2d_mkl.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + ":runtime_conv2d", + ":runtime_single_threaded_conv2d", + "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/core:framework_lite", + "//tensorflow/core/kernels:eigen_helpers", + "//third_party/eigen3", + ] + if_mkl([ + "@mkl_dnn", + "//third_party/mkl:intel_binary_blob", + ]), +) + cc_library( name = "runtime_fft", srcs = [ @@ -482,7 +517,6 @@ cc_library( deps = [ "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:framework", "//tensorflow/core:framework_lite", "//third_party/eigen3", ], @@ -544,6 +578,22 @@ cc_library( ], ) +cc_library( + name = "runtime_single_threaded_fft", + srcs = [ + "runtime_fft_impl.h", + "runtime_single_threaded_fft.cc", + ], + hdrs = ["runtime_single_threaded_fft.h"], + copts = runtime_copts(), + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:framework_lite", + "//third_party/eigen3", + ], +) + cc_library( name = "runtime_single_threaded_matmul", srcs = ["runtime_single_threaded_matmul.cc"], @@ -599,6 +649,7 @@ tf_cc_test( deps = [ ":cpu_instruction_fusion", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", @@ -636,6 +687,7 @@ cc_library( hdrs = ["ir_emission_utils.h"], deps = [ ":cpu_runtime", + ":target_machine_features", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla/service:hlo", @@ -648,14 +700,15 @@ tf_cc_test( srcs = ["ir_emission_utils_test.cc"], deps = [ ":ir_emission_utils", + ":target_machine_features_fake", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -666,6 +719,7 @@ cc_library( deps = [ ":dot_op_emitter", ":ir_emission_utils", + ":target_machine_features", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:layout_assignment", @@ -679,6 +733,7 @@ tf_cc_test( srcs = ["cpu_layout_assignment_test.cc"], deps = [ ":cpu_layout_assignment", + ":target_machine_features_fake", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", @@ -703,6 +758,7 @@ cc_library( deps = [ ":cpu_runtime", ":ir_emission_utils", + 
":target_machine_features", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", @@ -717,6 +773,7 @@ tf_cc_test( srcs = ["conv_canonicalization_test.cc"], deps = [ ":conv_canonicalization", + ":target_machine_features_fake", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", @@ -755,6 +812,7 @@ cc_library( ":dot_op_emitter", ":ir_emission_utils", ":shape_partition", + ":target_machine_features", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_cost_analysis", "//tensorflow/compiler/xla/service:hlo_pass", @@ -767,6 +825,7 @@ tf_cc_test( deps = [ ":cpu_executable", ":parallel_task_assignment", + ":target_machine_features_fake", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:shape_layout", "//tensorflow/compiler/xla:shape_util", @@ -889,3 +948,17 @@ tf_cc_test( "//tensorflow/core:test", ], ) + +tf_cc_test( + name = "cpu_eigen_tensor_alignment_test", + size = "small", + srcs = ["cpu_eigen_tensor_alignment_test.cc"], + deps = [ + ":dot_op_emitter", + ":ir_emission_utils", + ":target_machine_features_fake", + "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + ], +) diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc index 2136aeb3877685..0985b9297fe487 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.cc @@ -33,7 +33,8 @@ StatusOr ConvCanonicalization::Run(HloModule* module) { for (HloInstruction* hlo : module->entry_computation()->MakeInstructionPostOrder()) { if (hlo->opcode() == HloOpcode::kConvolution && - !PotentiallyImplementedAsEigenConvolution(*hlo)) { + !PotentiallyImplementedAsEigenConvolution(*hlo, + target_machine_features_)) { const ConvolutionDimensionNumbers& dnums = hlo->convolution_dimension_numbers(); auto input_batch_dim = dnums.input_batch_dimension(); diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h index 9b2c3d82eb673c..e6fd1499edd009 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CONV_CANONICALIZATION_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CONV_CANONICALIZATION_H_ +#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" @@ -32,12 +33,19 @@ namespace cpu { // convolutions can run faster. 
class ConvCanonicalization : public HloPassInterface { public: + explicit ConvCanonicalization( + const TargetMachineFeatures* target_machine_features) + : target_machine_features_(*target_machine_features) {} + ~ConvCanonicalization() override {} tensorflow::StringPiece name() const override { return "convolution-canonicalization"; } StatusOr Run(HloModule* module) override; + + private: + const TargetMachineFeatures& target_machine_features_; }; } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc index 968f53d5c70665..375b017b09263c 100644 --- a/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc +++ b/tensorflow/compiler/xla/service/cpu/conv_canonicalization_test.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -89,7 +90,11 @@ TEST_F(ConvCanonicalizationTest, NonCanonicalToCanonical) { HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - ConvCanonicalization conv_canonicalization; + cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features( + [](int64 shape_size) { + return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; + }); + ConvCanonicalization conv_canonicalization(&target_machine_features); EXPECT_TRUE(conv_canonicalization.Run(module.get()).ValueOrDie()); const HloInstruction* output_reshape = entry_computation->root_instruction(); @@ -146,7 +151,11 @@ TEST_F(ConvCanonicalizationTest, CanonicalStaysTheSame) { auto module = CreateNewModule(); module->AddEntryComputation(builder.Build()); - ConvCanonicalization conv_canonicalization; + cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features( + [](int64 shape_size) { + return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; + }); + ConvCanonicalization conv_canonicalization(&target_machine_features); EXPECT_FALSE(conv_canonicalization.Run(module.get()).ValueOrDie()); } diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc index 3c0c367df30639..25b18eff20f901 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc @@ -43,6 +43,7 @@ limitations under the License. #include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" +#include "tensorflow/compiler/xla/service/batch_dot_simplification.h" #include "tensorflow/compiler/xla/service/batchnorm_expander.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/buffer_liveness.h" @@ -81,12 +82,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_scheduling.h" #include "tensorflow/compiler/xla/service/hlo_subcomputation_unification.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" +#include "tensorflow/compiler/xla/service/indexed_array_analysis.h" #include "tensorflow/compiler/xla/service/inliner.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/service/reduce_precision_insertion.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h" #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" #include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" @@ -118,10 +121,12 @@ se::Platform::Id CpuAotCompilationOptions::PlatformId() const { CpuAotCompilationResult::CpuAotCompilationResult( ObjectFileData object_file_data, BufferSizes buffer_sizes, - int64 result_buffer_index) + int64 result_buffer_index, + std::unique_ptr hlo_profile_printer_data) : object_file_data_(std::move(object_file_data)), buffer_sizes_(std::move(buffer_sizes)), - result_buffer_index_(result_buffer_index) {} + result_buffer_index_(result_buffer_index), + hlo_profile_printer_data_(std::move(hlo_profile_printer_data)) {} CpuAotCompilationResult::~CpuAotCompilationResult() = default; @@ -171,14 +176,13 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault { public: static StatusOr> GetCandidatesForComputation( - HloComputation* computation, + const HloComputation& computation, const std::unordered_map& assigned_indices) { std::unordered_map hlo_to_profile_idx; CollectProfileCandidates profile_candidates_for_computation( &hlo_to_profile_idx, assigned_indices); - TF_RETURN_IF_ERROR( - computation->Accept(&profile_candidates_for_computation)); + TF_RETURN_IF_ERROR(computation.Accept(&profile_candidates_for_computation)); return hlo_to_profile_idx; } @@ -229,7 +233,10 @@ class CollectProfileCandidates : public DfsHloVisitorWithDefault { }; } // namespace -Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { +Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile, + llvm::TargetMachine* target_machine) { + LLVMTargetMachineFeatures target_machine_features(target_machine); + // Optimization pipeline. HloPassPipeline pipeline("CPU"); pipeline.AddInvariantChecker(); @@ -246,8 +253,9 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { // TODO(b/65775800): Fix wrong output bug in Call and remove the CallInliner // pass. 
pipeline.AddPass(); + pipeline.AddPass(); pipeline.AddPass(); - pipeline.AddPass(); + pipeline.AddPass(&target_machine_features); { auto& pass = pipeline.AddPass>("simplification"); @@ -258,7 +266,6 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true, /*use_fusion=*/false); - pipeline.AddPass(); pass.AddPass( /*is_layout_sensitive=*/false, [](const Shape&, const Shape&) { return false; }, @@ -270,16 +277,19 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { pass.AddPass(); pass.AddPass(); + pass.AddPass(); pass.AddPass(); pass.AddPass(); pass.AddPass(); pass.AddPass(); pass.AddPass(); } + pipeline.AddPass(); pipeline.AddPass( - [](const HloInstruction& dot, - const TransposeFolding::OperandIndices& candidate_operands) { - return PotentiallyImplementedAsEigenDot(dot) + [&target_machine_features]( + const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return PotentiallyImplementedAsEigenDot(dot, target_machine_features) ? candidate_operands : TransposeFolding::OperandIndices{}; }, @@ -287,12 +297,15 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { pipeline.AddPass(/*is_layout_sensitive=*/false); pipeline.AddPass(); + pipeline.AddPass(); + ReducePrecisionInsertion::AddPasses( &pipeline, module->config().debug_options(), ReducePrecisionInsertion::PassTiming::AFTER_FUSION); pipeline.AddPass( - module->mutable_entry_computation_layout()); + module->mutable_device_entry_computation_layout(), + &target_machine_features); // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. pipeline.AddPass>( @@ -312,8 +325,8 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile) { // and thread synchronization dependencies which would likely increase // binary size (and most AOT applications are single-threaded). // TODO(b/29630486) Support multi-threaded AOT. - pipeline.AddPass(max_parallelism, - ShapeSizeBytesFunction()); + pipeline.AddPass( + max_parallelism, ShapeSizeBytesFunction(), &target_machine_features); } // Copy insertion should be performed immediately before IR emission to avoid // inserting unnecessary copies (later pass adds an instruction which @@ -423,6 +436,41 @@ Status VerifyLlvmModule(const llvm::Module& llvm_module) { return Status::OK(); } +Status CreateHloProfilingArtifacts( + const HloModule& module, + std::unordered_map* + instruction_to_profile_idx, + std::unordered_map* + computation_to_profile_idx, + std::unique_ptr* hlo_profile_index_map, + std::unique_ptr* hlo_profile_printer_data) { + *hlo_profile_index_map = MakeUnique(module); + const HloComputation& entry_computation = *module.entry_computation(); + + TF_ASSIGN_OR_RETURN( + *instruction_to_profile_idx, + CollectProfileCandidates::GetCandidatesForComputation( + entry_computation, + (*hlo_profile_index_map)->instruction_to_profile_idx())); + + auto shape_size_bytes = [](const Shape& shape) { + // On the cpu, opaques are pointers. 
+ if (ShapeUtil::IsOpaque(shape)) { + return static_cast(sizeof(void*)); + } + return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); + }; + + HloCostAnalysis cost_analysis(shape_size_bytes); + TF_RETURN_IF_ERROR(entry_computation.Accept(&cost_analysis)); + *hlo_profile_printer_data = + CreateHloProfilePrinterData(**hlo_profile_index_map, cost_analysis); + *computation_to_profile_idx = + (*hlo_profile_index_map)->computation_to_profile_idx(); + + return Status::OK(); +} + } // namespace StatusOr> CpuCompiler::RunHloPasses( @@ -431,7 +479,13 @@ StatusOr> CpuCompiler::RunHloPasses( VLOG(2) << "Before optimization:"; XLA_VLOG_LINES(2, module->ToString()); - TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false)); + std::unique_ptr jit_target_machine = + SimpleOrcJIT::InferTargetMachineForJIT( + CompilerTargetOptions(module->config()), + CodeGenOptLevel(module->config())); + + TF_RETURN_IF_ERROR(RunHloPasses(module.get(), /*is_aot_compile=*/false, + jit_target_machine.get())); VLOG(2) << "After optimization:"; XLA_VLOG_LINES(2, module->ToString()); @@ -477,28 +531,9 @@ StatusOr> CpuCompiler::RunBackend( std::unique_ptr hlo_profile_index_map; std::unique_ptr hlo_profile_printer_data; if (module->config().hlo_profiling_enabled()) { - hlo_profile_index_map = MakeUnique(*module); - - TF_ASSIGN_OR_RETURN( - instruction_to_profile_idx, - CollectProfileCandidates::GetCandidatesForComputation( - entry_computation, - hlo_profile_index_map->instruction_to_profile_idx())); - - auto shape_size_bytes = [](const Shape& shape) { - // On the cpu, opaques are pointers. - if (ShapeUtil::IsOpaque(shape)) { - return static_cast(sizeof(void*)); - } - return ShapeUtil::ByteSizeOf(shape, sizeof(void*)); - }; - - HloCostAnalysis cost_analysis(shape_size_bytes); - TF_RETURN_IF_ERROR(entry_computation->Accept(&cost_analysis)); - hlo_profile_printer_data = - CreateHloProfilePrinterData(*hlo_profile_index_map, cost_analysis); - computation_to_profile_idx = - hlo_profile_index_map->computation_to_profile_idx(); + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); } std::unique_ptr cpu_executable; @@ -515,7 +550,8 @@ StatusOr> CpuCompiler::RunBackend( // and reduced memory usage (as compared to using DependencyHloOrdering). TF_ASSIGN_OR_RETURN( SequentialHloOrdering::HloModuleSequence module_sequence, - CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction())); + CreateMemoryMinimizingSequence(*module, BufferSizeBytesFunction(), + DFSMemoryScheduler)); // Run buffer analysis on the HLO graph. This analysis figures out which // temporary buffers are required to run the computation. @@ -540,10 +576,11 @@ StatusOr> CpuCompiler::RunBackend( // GetEmbeddedComputations guarantees that a called computation occurs // before a caller computation. 
+ LLVMTargetMachineFeatures target_machine_features(jit->target_machine()); IrEmitter ir_emitter(*module, *assignment, llvm_module.get(), std::move(instruction_to_profile_idx), std::move(computation_to_profile_idx), - jit->target_machine(), jit->external_constant_pool()); + &target_machine_features, jit->external_constant_pool()); for (auto embedded_computation : entry_computation->MakeEmbeddedComputationsList()) { @@ -685,7 +722,8 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, VLOG(2) << "Before optimization:"; XLA_VLOG_LINES(2, module->ToString()); - TF_RETURN_IF_ERROR(RunHloPasses(module, /*is_aot_compile=*/true)); + TF_RETURN_IF_ERROR( + RunHloPasses(module, /*is_aot_compile=*/true, target_machine.get())); VLOG(2) << "After optimization:"; XLA_VLOG_LINES(2, module->ToString()); @@ -714,12 +752,22 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, proto, xla_dump_optimized_hlo_proto_to, module->name())); } + std::unordered_map instruction_to_profile_idx; + std::unordered_map computation_to_profile_idx; + std::unique_ptr hlo_profile_index_map; + std::unique_ptr hlo_profile_printer_data; + + if (module->config().hlo_profiling_enabled()) { + TF_RETURN_IF_ERROR(CreateHloProfilingArtifacts( + *module, &instruction_to_profile_idx, &computation_to_profile_idx, + &hlo_profile_index_map, &hlo_profile_printer_data)); + } + + LLVMTargetMachineFeatures target_machine_features(target_machine.get()); IrEmitter ir_emitter(*module, *assignment, &llvm_module, - /*instruction_to_profile_idx=*/ - std::unordered_map{}, - /*computation_to_profile_idx=*/ - std::unordered_map{}, - target_machine.get(), + std::move(instruction_to_profile_idx), + std::move(computation_to_profile_idx), + &target_machine_features, /*external_constant_pool=*/nullptr); HloComputation* computation = module->entry_computation(); for (auto embedded_computation : @@ -760,6 +808,8 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, TF_RETURN_IF_ERROR(verify_status); } + XLA_VLOG_LINES(2, "LLVM IR:\n" + llvm_ir::DumpModuleToString(llvm_module)); + Disassembler disassembler(*target_machine); CompilerFunctor compiler_functor( target_machine.get(), &disassembler, opt_level, @@ -793,7 +843,7 @@ CpuCompiler::CompileAheadOfTime(std::vector> modules, results.emplace_back(MakeUnique( std::move(object_file_data), std::move(buffer_sizes), - result_slice.index())); + result_slice.index(), std::move(hlo_profile_printer_data))); } VLOG(1) << "Compilation finished"; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h index 151af38438a980..e56f9f01134f84 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.h @@ -18,6 +18,7 @@ limitations under the License. 
#include +#include "llvm/Target/TargetMachine.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/llvm_compiler.h" @@ -76,10 +77,16 @@ class CpuAotCompilationOptions : public AotCompilationOptions { class CpuAotCompilationResult : public AotCompilationResult { public: - CpuAotCompilationResult(ObjectFileData object_file_data, - BufferSizes buffer_sizes, int64 result_buffer_index); + CpuAotCompilationResult( + ObjectFileData object_file_data, BufferSizes buffer_sizes, + int64 result_buffer_index, + std::unique_ptr hlo_profile_printer_data); ~CpuAotCompilationResult(); + HloProfilePrinterData* hlo_profile_printer_data() const { + return hlo_profile_printer_data_.get(); + } + const ObjectFileData& object_file_data() const { return object_file_data_; } const BufferSizes& buffer_sizes() const { return buffer_sizes_; } int64 result_buffer_index() const { return result_buffer_index_; } @@ -97,6 +104,10 @@ class CpuAotCompilationResult : public AotCompilationResult { // result of the computation. This buffer should be passed into the output // parameter when calling the compiled computation. const int64 result_buffer_index_; + + // Contains an instance of HloProfilePrinterData if HLO profiling is enabled, + // otherwise is nullptr. + std::unique_ptr hlo_profile_printer_data_; }; // CPU-targeting implementation of the XLA Compiler interface. @@ -138,7 +149,8 @@ class CpuCompiler : public LLVMCompiler { // Runs the HLO passes which are necessary for both optimizations and // correctness. - Status RunHloPasses(HloModule* module, bool is_aot_compile); + Status RunHloPasses(HloModule* module, bool is_aot_compile, + llvm::TargetMachine* target_machine); TF_DISALLOW_COPY_AND_ASSIGN(CpuCompiler); }; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc new file mode 100644 index 00000000000000..8727c72b6e4251 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc @@ -0,0 +1,94 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h" +#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/test.h" + +namespace xla { +namespace cpu { +namespace { + +// Test that we don't call into Eigen with tensors too small to be aligned +// reliably. 
+ +class CpuEigenTensorAlignmentTest : public ::testing::Test {}; + +TEST_F(CpuEigenTensorAlignmentTest, EigenDotAlignment) { + string hlo_string = R"( +HloModule DotOperation + +ENTRY DotOperation { + arg0 = f32[5,256] parameter(0) + arg1 = f32[256,1024] parameter(1) + ROOT dot = f32[5,1024] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + HloInstruction* dot = module->entry_computation()->root_instruction(); + + TargetMachineFeaturesWithFakeAlignmentLogic target_machine_with_no_alignment( + [](int64 size) { return 1; }); + + EXPECT_FALSE( + PotentiallyImplementedAsEigenDot(*dot, target_machine_with_no_alignment)); + + TargetMachineFeaturesWithFakeAlignmentLogic + target_machine_with_full_alignment([](int64 size) { + return TargetMachineFeatures::kEigenExpectedTensorAlignment; + }); + + EXPECT_TRUE(PotentiallyImplementedAsEigenDot( + *dot, target_machine_with_full_alignment)); +} + +TEST_F(CpuEigenTensorAlignmentTest, EigenConvAlignment) { + string hlo_string = R"( +HloModule ConvOperation + +ENTRY ConvOperation { + arg0 = f32[1,2,1] parameter(0) + arg1 = f32[1,1,1] parameter(1) + ROOT conv = f32[1,2,1] convolution(arg0, arg1), window={size=1}, dim_labels=b0f_0io->b0f +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + HloInstruction* conv = module->entry_computation()->root_instruction(); + + TargetMachineFeaturesWithFakeAlignmentLogic target_machine_with_no_alignment( + [](int64 size) { return 1; }); + + EXPECT_FALSE(PotentiallyImplementedAsEigenConvolution( + *conv, target_machine_with_no_alignment)); + + TargetMachineFeaturesWithFakeAlignmentLogic + target_machine_with_full_alignment([](int64 size) { + return TargetMachineFeatures::kEigenExpectedTensorAlignment; + }); + + EXPECT_TRUE(PotentiallyImplementedAsEigenConvolution( + *conv, target_machine_with_full_alignment)); +} +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc index aabf4d5161e3af..cf43b74c699ca8 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc @@ -73,7 +73,7 @@ CpuExecutable::CpuExecutable( Status CpuExecutable::AllocateBuffers( DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers) { + std::vector* buffers) { CHECK_EQ(buffers->size(), assignment_->Allocations().size()); VLOG(3) << "Allocating " << assignment_->Allocations().size() << " allocations for module " << module().name(); @@ -201,59 +201,18 @@ Status CpuExecutable::ExecuteComputeFunction( return Status::OK(); } -static void LogLiveAddresses( - tensorflow::gtl::ArraySlice buffers, - const std::vector& buffers_in_result) { - if (!VLOG_IS_ON(3)) { - return; - } - - CHECK_EQ(buffers.size(), buffers_in_result.size()); - std::vector live_out_buffers; - for (int i = 0; i < buffers.size(); ++i) { - if (buffers_in_result[i]) { - live_out_buffers.push_back(buffers[i].opaque()); - } - } - VLOG(3) << "Live addresses in output marking found " - << live_out_buffers.size() << " addresses:\n" - << tensorflow::str_util::Join( - live_out_buffers, ", ", [](string* out, const void* address) { - tensorflow::strings::StrAppend( - out, tensorflow::strings::Printf("%p", address)); - }); -} - -static Status DeallocateTempBuffers( - DeviceMemoryAllocator* allocator, se::Stream* stream, - 
tensorflow::gtl::ArraySlice buffers, - const std::vector& buffers_in_result) { - // Keep those buffers in the output of the marked live because they are needed - // by the service. They will be deallocated by the service. - for (size_t i = 0; i < buffers.size(); ++i) { - se::DeviceMemoryBase alloc = buffers[i]; - if (!buffers_in_result[i] && !alloc.is_null()) { - VLOG(3) << "CpuExecutable deallocating buffer #" << i << " [" - << alloc.opaque() << "]"; - TF_RETURN_IF_ERROR( - allocator->Deallocate(stream->parent()->device_ordinal(), &alloc)); - } - } - - return Status::OK(); -} - StatusOr CpuExecutable::CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice allocated_buffers, - std::vector* buffers_in_result) { + tensorflow::gtl::MutableArraySlice buffers) { se::Stream* stream = run_options->stream(); ScopedShapedBuffer result_buffer( - /*on_host_shape=*/result_shape(), /*on_device_shape=*/result_shape(), - run_options->allocator(), stream->parent()->device_ordinal()); + /*on_host_shape=*/host_result_shape(), + /*on_device_shape=*/host_result_shape(), run_options->allocator(), + stream->parent()->device_ordinal()); - // Copy DeviceMemoryBase values which contain the array(s) of the result into - // the respective location in ShapedBuffer which is returned to the caller. + // Move OwningDeviceMemory values which contain the array(s) of the result + // into the respective location in ScopedShapedBuffer which is returned to the + // caller. TF_RETURN_IF_ERROR(result_buffer.buffers().ForEachMutableElementWithStatus( [&](const ShapeIndex& index, se::DeviceMemoryBase* device_memory) { const auto& sources = this->GetRootPointsToSet().element(index); @@ -272,10 +231,9 @@ StatusOr CpuExecutable::CreateResultShapedBuffer( CHECK(!slice.allocation()->is_entry_computation_parameter()); const BufferAllocation::Index buffer_index = slice.index(); - const se::DeviceMemoryBase& buffer = allocated_buffers[buffer_index]; + OwningDeviceMemory& buffer = buffers[buffer_index]; CHECK(!buffer.is_null() || buffer.size() == 0); - *device_memory = buffer; - (*buffers_in_result)[buffer_index] = true; + *device_memory = buffer.Forget(); return Status::OK(); })); return std::move(result_buffer); @@ -291,23 +249,21 @@ StatusOr CpuExecutable::ExecuteOnStream( se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); + std::vector buffers(assignment_->Allocations().size()); TF_RETURN_IF_ERROR(AllocateBuffers( memory_allocator, stream->parent()->device_ordinal(), &buffers)); - TF_RETURN_IF_ERROR(ExecuteComputeFunction( - &run_options->run_options(), arguments, buffers, hlo_execution_profile)); - std::vector buffers_in_result(assignment_->Allocations().size(), false); - TF_ASSIGN_OR_RETURN( - ScopedShapedBuffer result_buffer, - CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); - - // Free all buffers not in the result. 
- TF_RETURN_IF_ERROR(DeallocateTempBuffers(memory_allocator, stream, buffers, - buffers_in_result)); + std::vector unowning_buffers; + unowning_buffers.reserve(buffers.size()); + for (auto& buffer : buffers) { + unowning_buffers.push_back(buffer.AsDeviceMemoryBase()); + } + TF_RETURN_IF_ERROR(ExecuteComputeFunction(&run_options->run_options(), + arguments, unowning_buffers, + hlo_execution_profile)); - return std::move(result_buffer); + return CreateResultShapedBuffer(run_options, &buffers); } StatusOr CpuExecutable::ExecuteAsyncOnStream( @@ -323,30 +279,53 @@ StatusOr CpuExecutable::ExecuteAsyncOnStream( run_options->stream()->implementation()); se::Stream* stream = run_options->stream(); DeviceMemoryAllocator* memory_allocator = run_options->allocator(); - std::vector buffers(assignment_->Allocations().size()); - + std::vector buffers(assignment_->Allocations().size()); TF_RETURN_IF_ERROR(AllocateBuffers( memory_allocator, stream->parent()->device_ordinal(), &buffers)); - std::vector buffers_in_result(assignment_->Allocations().size(), false); - TF_ASSIGN_OR_RETURN( - ScopedShapedBuffer result_buffer, - CreateResultShapedBuffer(run_options, buffers, &buffers_in_result)); - - LogLiveAddresses(buffers, buffers_in_result); - - host_stream->EnqueueTask([this, run_options, arguments, buffers, - buffers_in_result, memory_allocator, stream]() { - // Failing a CHECK here is not great, but I don't see an obvious way to - // return a failed Status asynchronously. - TF_CHECK_OK(ExecuteComputeFunction(&run_options->run_options(), arguments, - buffers, - /*hlo_execution_profile=*/nullptr)); - TF_CHECK_OK(DeallocateTempBuffers(memory_allocator, stream, buffers, - buffers_in_result)); - }); + std::vector unowning_buffers; + unowning_buffers.reserve(buffers.size()); + for (auto& buffer : buffers) { + unowning_buffers.push_back(buffer.AsDeviceMemoryBase()); + } + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, + CreateResultShapedBuffer(run_options, &buffers)); - return std::move(result_buffer); + // At this point, `unowning_buffers` contains unowning pointers to all of our + // buffers, and `buffers` contains owning pointers to the non-live-out + // buffers. Enqueue a task which keeps alive the non-live-out buffers. + // + // Logically we want this lambda to capture `buffers` by move, but ultimately our + // functor needs to be wrapped in an std::function, and that requires its + // functor to be copyable. Thus we perpetrate the hack of capturing buffers + // "by shared pointer". + // + // We also need to change the types of some of the variables we capture: + // run_options needs to change from a pointer to a value type, and arguments + // needs to change from an ArraySlice into a vector. We use a struct instead + // of a lambda to make this explicit. + struct AsyncRunTask { + CpuExecutable* executable; + ServiceExecutableRunOptions run_options; + std::vector arguments; + std::vector unowning_buffers; + std::shared_ptr> buffers; + + void operator()() { + // Failing a CHECK here is not great, but I don't see an obvious way to + // return a failed Status asynchronously.
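The comment in the new `ExecuteAsyncOnStream` body describes a general C++ pattern rather than anything XLA-specific: `std::function` only accepts copyable callables, so move-only state (here the owning buffers) is kept alive through a `shared_ptr` held by an explicit functor struct. The sketch below is a minimal, self-contained illustration of that pattern only; every name in it is made up for the example and is not part of this patch.

```c++
#include <functional>
#include <memory>
#include <utility>
#include <vector>

// Stand-in for a move-only resource such as an owning device buffer.
struct OwningBuffer {
  explicit OwningBuffer(int id) : id(id) {}
  OwningBuffer(OwningBuffer&&) = default;
  OwningBuffer& operator=(OwningBuffer&&) = default;
  OwningBuffer(const OwningBuffer&) = delete;
  OwningBuffer& operator=(const OwningBuffer&) = delete;
  int id;
};

// Explicit functor instead of a lambda: the "capture list" is spelled out as
// members, and the shared_ptr member keeps the functor copyable even though
// the buffers themselves are move-only.
struct AsyncTask {
  std::shared_ptr<std::vector<OwningBuffer>> buffers;
  void operator()() const {
    // The buffers stay alive until the last copy of this task is destroyed.
  }
};

int main() {
  std::vector<OwningBuffer> buffers;
  buffers.emplace_back(0);

  // Capturing `buffers` by move directly would make the callable move-only,
  // which std::function rejects; wrapping it in a shared_ptr sidesteps that.
  std::function<void()> task =
      AsyncTask{std::make_shared<std::vector<OwningBuffer>>(std::move(buffers))};
  task();
}
```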
+ TF_CHECK_OK(executable->ExecuteComputeFunction( + &run_options.run_options(), arguments, unowning_buffers, + /*hlo_execution_profile=*/nullptr)); + } + }; + host_stream->EnqueueTask(AsyncRunTask{ + this, *run_options, + std::vector(arguments.begin(), arguments.end()), + unowning_buffers, + std::make_shared>(std::move(buffers))}); + + return std::move(result); } /*static*/ int64 CpuExecutable::ShapeSizeBytes(const Shape& shape) { diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h index 68ad38cba88720..8dd47bfb865e8a 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h @@ -92,7 +92,7 @@ class CpuExecutable : public Executable { // buffer is assigned for this element. Status AllocateBuffers(DeviceMemoryAllocator* memory_allocator, int device_ordinal, - std::vector* buffers); + std::vector* buffers); // Calls the generated function performing the computation with the given // arguments using the supplied buffers. @@ -102,16 +102,12 @@ class CpuExecutable : public Executable { tensorflow::gtl::ArraySlice buffers, HloExecutionProfile* hlo_execution_profile); - // Creates a ScopedShapedBuffer for holding the result of the computation. The - // addresses (DeviceMemoryBases) are set according to buffer assignment. - // 'buffers_in_result' should point to a vector of the same size as - // 'allocated_buffers'. An element in buffers_in_result is set to true if the - // corresponding buffer is live out of the computation (and thus contained in - // the returned ShapedBuffer). + // Creates a ScopedShapedBuffer for holding the result of the computation, + // moving buffers out of allocated_buffers and into the result as appropriate. + // The addresses are set according to buffer assignment. StatusOr CreateResultShapedBuffer( const ServiceExecutableRunOptions* run_options, - tensorflow::gtl::ArraySlice allocated_buffers, - std::vector* buffers_in_result); + tensorflow::gtl::MutableArraySlice buffers); // Returns the points-to set of the root instruction of the entry // computation. Uses points-to analysis from buffer assignment. diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc index 0fc5a746bbbc76..b40d264c03aba6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc @@ -34,6 +34,7 @@ bool CanBeLoopFused(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kConcatenate || hlo.opcode() == HloOpcode::kDynamicSlice || hlo.opcode() == HloOpcode::kDynamicUpdateSlice || + hlo.opcode() == HloOpcode::kGather || hlo.opcode() == HloOpcode::kPad || hlo.opcode() == HloOpcode::kReshape || hlo.opcode() == HloOpcode::kReverse || diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc index 6ed1cd31b18f63..97e10a89a209c0 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -157,37 +158,95 @@ TEST_F(InstructionFusionTest, DotOperationFusion_ElementReuse) { EXPECT_EQ(dot, computation->root_instruction()); } -TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion) { - HloComputation::Builder builder(TestName()); - HloInstruction* arg0 = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeShape(F32, {1, 256}), "arg0")); - HloInstruction* arg1 = builder.AddInstruction(HloInstruction::CreateParameter( - 1, ShapeUtil::MakeShape(F32, {1024, 256}), "arg1")); +TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion_RHS) { + string hlo_string = R"( +HloModule DotOperationFusion_TransposeFusion - HloInstruction* exp1 = builder.AddInstruction(HloInstruction::CreateUnary( - ShapeUtil::MakeShape(S32, {1024, 256}), HloOpcode::kExp, arg1)); - HloInstruction* transpose1 = - builder.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::MakeShape(S32, {256, 1024}), exp1, {1, 0})); - builder.AddInstruction( - MakeDot(ShapeUtil::MakeShape(F32, {1, 1024}), arg0, transpose1)); +ENTRY DotOperationFusion_TransposeFusion { + arg0 = f32[1,256] parameter(0) + arg1 = f32[1024,256] parameter(1) + exponential = s32[1024,256] exponential(arg1) + transpose = s32[256,1024] transpose(exponential), dimensions={1,0} + ROOT dot = f32[1,1024] dot(arg0, transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + HloComputation* computation = module->entry_computation(); - auto module = CreateNewModule(); - auto computation = module->AddEntryComputation(builder.Build()); TransposeFolding transpose_folding( [](const HloInstruction& dot, const TransposeFolding::OperandIndices& candidate_operands) { return candidate_operands; }, TransposeFolding::NeverFoldTranspose); - EXPECT_TRUE(transpose_folding.Run(module.get()).ValueOrDie()); - EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kFusion); - EXPECT_EQ(computation->root_instruction()->fusion_kind(), - HloInstruction::FusionKind::kTransposeDot); - EXPECT_FALSE(CpuInstructionFusion().Run(module.get()).ValueOrDie()); - EXPECT_EQ(computation->root_instruction()->opcode(), HloOpcode::kFusion); - EXPECT_EQ(computation->root_instruction()->fusion_kind(), - HloInstruction::FusionKind::kTransposeDot); + TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get())); + ASSERT_TRUE(changed); + ASSERT_THAT(computation->root_instruction(), + op::Dot(op::Parameter(0), op::Exp(op::Parameter(1)), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/1)); +} + +TEST_F(InstructionFusionTest, DotOperationFusion_TransposeFusion_LHS) { + string hlo_string = R"( +HloModule DotOperationFusion_TransposeFusion + +ENTRY DotOperationFusion_TransposeFusion { + arg0 = f32[256,1] parameter(0) + arg1 = f32[256,1024] parameter(1) + transpose = s32[1,256] transpose(arg0), dimensions={1,0} + exponential = s32[256,1024] exponential(arg1) + ROOT dot = f32[1,1024] dot(transpose, exponential), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + HloComputation* computation = module->entry_computation(); + + TransposeFolding 
transpose_folding( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }, + TransposeFolding::NeverFoldTranspose); + TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get())); + ASSERT_TRUE(changed); + ASSERT_THAT(computation->root_instruction(), + op::Dot(op::Parameter(0), op::Exp(op::Parameter(1)), + /*lhs_contracting_dim=*/0, /*rhs_contracting_dim=*/0)); +} + +TEST_F(InstructionFusionTest, + DotOperationFusion_TransposeFusion_LHS_NonDefault) { + string hlo_string = R"( +HloModule DotOperationFusion_TransposeFusion + +ENTRY DotOperationFusion_TransposeFusion { + arg0 = f32[1,256] parameter(0) + arg1 = f32[256,1024] parameter(1) + transpose = s32[256,1] transpose(arg0), dimensions={1,0} + exponential = s32[256,1024] exponential(arg1) + ROOT dot = f32[1,1024] dot(transpose, exponential), lhs_contracting_dims={0}, rhs_contracting_dims={0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + HloComputation* computation = module->entry_computation(); + + TransposeFolding transpose_folding( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }, + TransposeFolding::NeverFoldTranspose); + TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get())); + ASSERT_TRUE(changed); + ASSERT_THAT(computation->root_instruction(), + op::Dot(op::Parameter(0), op::Exp(op::Parameter(1)), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/0)); } class OpcodeFusionTest : public InstructionFusionTest { @@ -697,6 +756,154 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) { Not(op::Fusion())); } +struct GatherLoopFusionTestSpec { + string test_name; + string hlo_computation_text; + + static string Name( + const ::testing::TestParamInfo& info) { + return info.param.test_name; + } +}; + +class GatherLoopFusionTest + : public OpcodeFusionTest, + public ::testing::WithParamInterface {}; + +TEST_P(GatherLoopFusionTest, GatherLoopFusion) { + const GatherLoopFusionTestSpec& spec = GetParam(); + string hlo_string = tensorflow::strings::StrCat( + "HloModule ", spec.test_name, "\n\n", spec.hlo_computation_text); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + RunFusionAndCheckOpcodesWereFused( + module.get(), + {HloOpcode::kGather, HloOpcode::kAdd, HloOpcode::kBroadcast, + HloOpcode::kParameter, HloOpcode::kParameter, HloOpcode::kParameter}); +} + +std::vector GetGatherLoopFusionTestSpecs() { + std::vector result; + + result.push_back({"FusedTensorFlowGatherV2", R"( +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + gather = s32[3,2] gather(operand, indices), + output_window_dims={0}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=1, + window_bounds={3, 1} + one = s32[] constant(1) + one_broadcasted = s32[3,2] broadcast(one), dimensions={} + ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted) +} +)"}); + + result.push_back({"FusedTensorFlowGatherMultipleBatchDims", R"( +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,3,2] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=2, + window_bounds={3, 1} + one = s32[] constant(1) + one_broadcasted = s32[2,3,2] broadcast(one), dimensions={} + ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted) +} +)"}); + + 
result.push_back({"FusedTensorFlowGatherNdMultipleBatchDims", R"( +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2,2] parameter(1) + gather = s32[2,2] gather(operand, indices), + output_window_dims={}, + elided_window_dims={0,1}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=2, + window_bounds={1, 1} + one = s32[] constant(1) + one_broadcasted = s32[2,2] broadcast(one), dimensions={} + ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) +} +)"}); + + result.push_back({"FusedTensorFlowGatherNd_0", R"( +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,2] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0,1}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=1, + window_bounds={1,1,2} + one = s32[] constant(1) + one_broadcasted = s32[2,2] broadcast(one), dimensions={} + ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) +} +)"}); + + result.push_back({"FusedTensorFlowGatherNd_1", R"( +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,2] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0,1}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=0, + window_bounds={1,1,2} + one = s32[] constant(1) + one_broadcasted = s32[2,2] broadcast(one), dimensions={} + ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) +} +)"}); + + result.push_back({"FusedDynamicSlice", R"( +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + gather = s32[1,1] gather(operand, indices), + output_window_dims={0,1}, + elided_window_dims={}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=0, + window_bounds={1,1} + one = s32[] constant(1) + one_broadcasted = s32[1,1] broadcast(one), dimensions={} + ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted) +} +)"}); + + result.push_back({"FusedBatchDynamicSlice", R"( +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,1,1] gather(operand, indices), + output_window_dims={1,2}, + elided_window_dims={}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=0, + window_bounds={1,1} + one = s32[] constant(1) + one_broadcasted = s32[2,1,1] broadcast(one), dimensions={} + ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted) +} +)"}); + + return result; +} + +INSTANTIATE_TEST_CASE_P(GatherLoopFusionTestInstantiation, GatherLoopFusionTest, + ::testing::ValuesIn(GetGatherLoopFusionTestSpecs()), + GatherLoopFusionTestSpec::Name); } // namespace } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc index e8117377e61a4e..aa872d5ec9e759 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.cc @@ -100,7 +100,8 @@ Status CpuLayoutAssignment::AddBackendConstraints( const HloComputation* computation = constraints->computation(); for (auto* instruction : computation->instructions()) { if (instruction->opcode() == HloOpcode::kConvolution && - PotentiallyImplementedAsEigenConvolution(*instruction)) { + PotentiallyImplementedAsEigenConvolution(*instruction, + target_machine_features_)) { const HloInstruction* convolution = instruction; const HloInstruction* lhs_instruction = convolution->operand(0); const HloInstruction* rhs_instruction = convolution->operand(1); @@ -126,7 +127,8 @@ Status 
CpuLayoutAssignment::AddBackendConstraints( const HloInstruction* op = instruction->operand(*op_idx); TF_RETURN_IF_ERROR(constraints->SetOperandLayout( ColMajorShape(op->shape()), instruction, *op_idx)); - } else if (PotentiallyImplementedAsEigenDot(*instruction)) { + } else if (PotentiallyImplementedAsEigenDot(*instruction, + target_machine_features_)) { const HloInstruction* dot = instruction; // In order to implement `dot` with Eigen dot, the layouts of the lhs, // rhs, and output need to be row-major. @@ -139,13 +141,9 @@ Status CpuLayoutAssignment::AddBackendConstraints( Shape lhs_shape(RowMajorShape(lhs_instruction->shape())); TF_RETURN_IF_ERROR(constraints->SetOperandLayout(lhs_shape, dot, 0)); - // dot is a kDot or a kTransposeDot fusion node. In the latter case, if - // it represents X @ X, it may have just one operand. - if (dot->operand_count() > 1) { - const HloInstruction* rhs_instruction = dot->operand(1); - Shape rhs_shape(RowMajorShape(rhs_instruction->shape())); - TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1)); - } + const HloInstruction* rhs_instruction = dot->operand(1); + Shape rhs_shape(RowMajorShape(rhs_instruction->shape())); + TF_RETURN_IF_ERROR(constraints->SetOperandLayout(rhs_shape, dot, 1)); // Set layouts of the instructions' shapes. TF_RETURN_IF_ERROR(constraints->SetInstructionLayout(output_shape, dot)); @@ -181,7 +179,7 @@ Status CpuLayoutAssignment::AddBackendConstraints( } } } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h index c8edbb9e15a5b6..3c4fe68b830d96 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_CPU_LAYOUT_ASSIGNMENT_H_ #include "tensorflow/compiler/xla/service/computation_layout.h" +#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/layout_assignment.h" #include "tensorflow/core/lib/core/status.h" @@ -27,12 +28,17 @@ namespace cpu { // layout constraints for operands and results of library calls. class CpuLayoutAssignment : public LayoutAssignment { public: - explicit CpuLayoutAssignment(ComputationLayout* entry_computation_layout) - : LayoutAssignment(entry_computation_layout) {} + explicit CpuLayoutAssignment( + ComputationLayout* entry_computation_layout, + const TargetMachineFeatures* target_machine_features) + : LayoutAssignment(entry_computation_layout), + target_machine_features_(*target_machine_features) {} ~CpuLayoutAssignment() override {} protected: Status AddBackendConstraints(LayoutConstraints* constraints) override; + + const TargetMachineFeatures& target_machine_features_; }; } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc index 6ba030fff3bbc5..429fc7b78608da 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc @@ -24,6 +24,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/algebraic_simplifier.h" #include "tensorflow/compiler/xla/service/computation_layout.h" +#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" @@ -49,7 +50,12 @@ class CpuLayoutAssignmentTest : public HloTestBase { protected: void AssignLayouts(HloModule* module, ComputationLayout* entry_computation_layout) { - cpu::CpuLayoutAssignment layout_assignment(entry_computation_layout); + cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features( + [](int64 shape_size) { + return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; + }); + cpu::CpuLayoutAssignment layout_assignment(entry_computation_layout, + &target_machine_features); EXPECT_IS_OK(layout_assignment.Run(module).status()); } }; @@ -311,7 +317,12 @@ static StatusOr RunDotOutputFusion( result.addend_fusion_param = fusion_instruction->operand( fused_add->operand(1 - dot_operand_idx_in_add)->parameter_number()); - cpu::CpuLayoutAssignment layout_assignment(&computation_layout); + cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features( + [](int64 shape_size) { + return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; + }); + cpu::CpuLayoutAssignment layout_assignment(&computation_layout, + &target_machine_features); TF_ASSIGN_OR_RETURN(result.layout_assignment_changed_something, layout_assignment.Run(module)); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc index f9c51f243c47b8..e75fcb6bc9719f 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc @@ -22,6 +22,8 @@ namespace { const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size"; const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce"; const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor"; +const char* const kXlaEnableExperimentalLlvmIrGemm = + "xla_enable_experimental_llvm_ir_gemm"; } // namespace @@ -54,6 +56,12 @@ tensorflow::gtl::optional LlvmIrGemvTilingFactor( return tensorflow::gtl::nullopt; } +bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) { + const auto& extra_options_map = + config.debug_options().xla_backend_extra_options(); + return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0; +} + } // namespace options } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h index be62ff3cc1af23..106dfbbc62dfba 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_options.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h @@ -26,6 +26,7 @@ namespace options { bool OptimizeForSizeRequested(const HloModuleConfig& config); bool VectorizedReduceDisabled(const HloModuleConfig& config); +bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config); tensorflow::gtl::optional LlvmIrGemvTilingFactor( const HloModuleConfig& config); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc index 872b0be1f8a8ec..54c52bc08f9c53 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.cc @@ -37,6 +37,7 @@ 
extern const char* const kEigenMatMulF32SymbolName = "__xla_cpu_runtime_EigenMatMulF32"; extern const char* const kEigenMatMulF64SymbolName = "__xla_cpu_runtime_EigenMatMulF64"; +extern const char* const kMKLConvF32SymbolName = "__xla_cpu_runtime_MKLConvF32"; extern const char* const kMKLMatMulF32SymbolName = "__xla_cpu_runtime_MKLMatMulF32"; extern const char* const kMKLMatMulF64SymbolName = @@ -50,6 +51,8 @@ extern const char* const kEigenConvF16SymbolName = extern const char* const kEigenConvF32SymbolName = "__xla_cpu_runtime_EigenConvF32"; extern const char* const kEigenFftSymbolName = "__xla_cpu_runtime_EigenFft"; +extern const char* const kEigenSingleThreadedFftSymbolName = + "__xla_cpu_runtime_EigenSingleThreadedFft"; extern const char* const kEigenSingleThreadedMatMulF16SymbolName = "__xla_cpu_runtime_EigenSingleThreadedMatMulF16"; extern const char* const kEigenSingleThreadedMatMulF32SymbolName = diff --git a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h index e392e231b4c71b..aa0e96712302e8 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_runtime.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_runtime.h @@ -44,6 +44,7 @@ namespace runtime { extern const char* const kEigenMatMulF16SymbolName; extern const char* const kEigenMatMulF32SymbolName; extern const char* const kEigenMatMulF64SymbolName; +extern const char* const kMKLConvF32SymbolName; extern const char* const kMKLMatMulF32SymbolName; extern const char* const kMKLMatMulF64SymbolName; extern const char* const kMKLSingleThreadedMatMulF32SymbolName; @@ -51,6 +52,7 @@ extern const char* const kMKLSingleThreadedMatMulF64SymbolName; extern const char* const kEigenConvF16SymbolName; extern const char* const kEigenConvF32SymbolName; extern const char* const kEigenFftSymbolName; +extern const char* const kEigenSingleThreadedFftSymbolName; extern const char* const kEigenSingleThreadedMatMulF16SymbolName; extern const char* const kEigenSingleThreadedMatMulF32SymbolName; extern const char* const kEigenSingleThreadedMatMulF64SymbolName; diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc index 9b39e7f5765ae5..d97802ee45d6ad 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.cc @@ -88,8 +88,8 @@ CpuTransferManager::CpuTransferManager() : GenericTransferManager(se::host::kHostPlatformId, /*pointer_size=*/sizeof(void*)) {} -Status CpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor, - const Literal& literal) { +Status CpuTransferManager::TransferLiteralToInfeed( + se::StreamExecutor* executor, const LiteralSlice& literal) { const Shape& shape = literal.shape(); VLOG(2) << "Transferring literal to infeed with shape: " << ShapeUtil::HumanString(shape); diff --git a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h index 3ecb0d23649837..6dfc666f09dfa6 100644 --- a/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/cpu/cpu_transfer_manager.h @@ -38,7 +38,7 @@ class CpuTransferManager : public GenericTransferManager { ~CpuTransferManager() override {} Status TransferLiteralToInfeed(se::StreamExecutor* executor, - const Literal& literal) override; + const LiteralSlice& literal) override; Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, const void* source) override; 
Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc index 495fecc4aa8b3c..d77076546f404a 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc @@ -23,6 +23,7 @@ limitations under the License. #include "llvm/IR/Module.h" #include "llvm/IR/Value.h" #include "tensorflow/compiler/xla/service/cpu/cpu_runtime.h" +#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/cpu/vector_support_library.h" #include "tensorflow/compiler/xla/service/hlo_module.h" @@ -41,17 +42,17 @@ using llvm_ir::SetToFirstInsertPoint; namespace cpu { namespace { -// Loads a tile of values from a 2D tensor. -class TileLoader { +// Provides tiled access to an in-memory rank 2 array. +class MemoryTile { public: - // Constructs a TileLoader that will load a tile consisting of + // Constructs a MemoryTile that can operate on tiles consisting of // `tile_size_along_major_dim` vectors from the matrix `matrix`, starting at // `major_dim_offset` in the major dimension. The tile size along the minor // dimension is the vector size, and that is implicitly determined by `vsl`. - TileLoader(VectorSupportLibrary* vsl, llvm::IRBuilder<>* ir_builder, + MemoryTile(VectorSupportLibrary* vsl, llvm::IRBuilder<>* ir_builder, llvm::Value* matrix, int64 matrix_size_along_minor_dim, llvm::Value* major_dim_offset, int64 tile_size_along_major_dim) - : vsl_(vsl) { + : vsl_(vsl), ir_builder_(ir_builder) { pointers_.reserve(tile_size_along_major_dim); for (int64 i = 0; i < tile_size_along_major_dim; i++) { llvm::Value* total_offset = ir_builder->CreateMul( @@ -61,9 +62,10 @@ class TileLoader { } } - // Load a tile consisting of `tile_size_along_major_dim_` vectors starting at - // `major_dim_offset_` in the major dimension and `minor_dim_offset` in the - // minor dimension. + // Load a tile consisting of `tile_size_along_major_dim` vectors from position + // {major: `major_dim_offset`, minor: `minor_dim_offset`}. + // + // Note: `major_dim_offset` is a parameter to the constructor. std::vector LoadTile(llvm::Value* minor_dim_offset) const { std::vector result; result.reserve(pointers_.size()); @@ -73,11 +75,104 @@ class TileLoader { return result; } + // Stores `tile` to position {major: `major_dim_offset`, minor: + // `minor_dim_offset`}. + // + // Note: `major_dim_offset` is a parameter to the constructor. + void StoreTile(tensorflow::gtl::ArraySlice tile, + llvm::Value* minor_dim_offset) const { + CHECK_EQ(tile.size(), pointers_.size()); + for (int64 i = 0; i < pointers_.size(); i++) { + vsl_->StoreVector(tile[i], pointers_[i], minor_dim_offset); + } + } + + // Loads a tile of size [`tile_size_along_major_dim`, + // `tile_size_along_middle_dim`] from position {major: `major_dim_offset`, + // minor: `minor_dim_offset`} and then broadcasts each element into a vector + // of size vsl_.vector_size(). The (i,j)'th element of the return value is + // the (i,j)'th element in the tile broadcasted into an LLVM vector. + // + // Note: `major_dim_offset` is a parameter to the constructor. 
+ std::vector> LoadBroadcastTile( + llvm::Value* minor_dim_offset, int64 tile_size_along_middle_dim) const { + std::vector> result; + result.resize(pointers_.size()); + for (int64 i = 0; i < pointers_.size(); i++) { + for (int64 j = 0; j < tile_size_along_middle_dim; j++) { + result[i].push_back(vsl_->LoadBroadcast( + pointers_[i], ir_builder_->CreateAdd(minor_dim_offset, + ir_builder_->getInt64(j)))); + } + } + return result; + } + private: VectorSupportLibrary* vsl_; + llvm::IRBuilder<>* ir_builder_; std::vector pointers_; }; +// The base class for the classes representing the GEMV emitter configurations. +// +// The IR emitted (modulo the LLVM values representing the input and output +// buffers) by the row major and column major GEMV emitters should be a function +// of their configuration. This is important because their configuration is +// used as a key to cache the generated IR. +class GemvConfig { + public: + // Mixin for convenience. + template + struct User { + public: + PrimitiveType scalar_type() const { + return derived().config().scalar_type(); + } + int64 tile_rows() const { return derived().config().tile_rows(); } + int64 tile_cols() const { return derived().config().tile_cols(); } + int64 m() const { return derived().config().m(); } + int64 k() const { return derived().config().k(); } + int64 has_addend() const { return derived().config().has_addend(); } + + private: + const T& derived() const { return *static_cast(this); } + }; + + PrimitiveType scalar_type() const { return scalar_type_; } + int64 tile_rows() const { return tile_rows_; } + int64 tile_cols() const { return tile_cols_; } + int64 m() const { return m_; } + int64 k() const { return k_; } + bool has_addend() const { return has_addend_; } + + string GetCacheKey() const { + return tensorflow::strings::StrCat( + name_, "_", PrimitiveType_Name(scalar_type()), "_", tile_rows(), "_", + tile_cols(), "_", m(), "_", k(), has_addend() ? "_with_addend" : ""); + } + + protected: + explicit GemvConfig(string name, PrimitiveType scalar_type, int64 tile_rows, + int64 tile_cols, int64 m, int64 k, bool has_addend) + : name_(std::move(name)), + scalar_type_(scalar_type), + tile_rows_(tile_rows), + tile_cols_(tile_cols), + m_(m), + k_(k), + has_addend_(has_addend) {} + + private: + string name_; + PrimitiveType scalar_type_; + int64 tile_rows_; + int64 tile_cols_; + int64 m_; + int64 k_; + bool has_addend_; +}; + // Computes a dot product between "[M,K]{0,1} lhs" with a [K,1] vector (the // layout of the vector does not matter). This implementation uses a tiling // scheme to improve performance. @@ -139,38 +234,46 @@ class TileLoader { // TODO(sanjoy): We should investigate if using gather loads and scatter stores // can be used here have the same inner loop for both column-major and row-major // matrix-vector products. 
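To make the tiling scheme described above concrete, here is a minimal scalar C++ sketch of the computation the column-major GEMV emitter generates IR for. It is an illustrative reference only, not the emitter's output or part of this patch: the real code emits vectorized LLVM IR through `VectorSupportLibrary`, keeps the inner row loop in vector registers of width `tile_rows`, and handles row/column remainders in dedicated epilogue loops rather than the `std::min` clamps used here. The function name is made up for this sketch.

```c++
#include <algorithm>
#include <cstdint>
#include <vector>

// result[r] += sum_c lhs[r, c] * rhs[c], with lhs shaped [M, K] and stored
// column major ("{0,1}" layout), i.e. lhs[r, c] == lhs_data[c * m + r].
// Assumes *result already holds the addend (or zeros) on entry.
void ColumnMajorGemvReference(const std::vector<float>& lhs_data,
                              const std::vector<float>& rhs, int64_t m,
                              int64_t k, int64_t tile_rows, int64_t tile_cols,
                              std::vector<float>* result) {
  for (int64_t c0 = 0; c0 < k; c0 += tile_cols) {       // "dot.outer.tiled"
    int64_t c_end = std::min(c0 + tile_cols, k);        // column remainder
    for (int64_t r0 = 0; r0 < m; r0 += tile_rows) {     // "dot.inner.tiled"
      int64_t r_end = std::min(r0 + tile_rows, m);      // row remainder (epilogue)
      for (int64_t c = c0; c < c_end; ++c) {
        float rhs_c = rhs[c];                           // broadcast RHS element for this column
        for (int64_t r = r0; r < r_end; ++r) {          // one vector FMA per row tile in the emitter
          (*result)[r] += lhs_data[c * m + r] * rhs_c;
        }
      }
    }
  }
}
```

The loop labels in the comments correspond to the `ksl_.For` names used by the emitter below.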
-class ColumnMajorMatrixVectorProductEmitter { +class ColumnMajorMatrixVectorProductEmitter + : public GemvConfig::User { public: - ColumnMajorMatrixVectorProductEmitter(PrimitiveType scalar_type, - int64 tile_rows, int64 tile_cols, - int64 m, int64 k, llvm::Value* lhs, + class Config : public GemvConfig { + public: + explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols, + int64 m, int64 k, bool has_addend) + : GemvConfig(/*name=*/"col_major_gemv", scalar_type, + /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m, + /*k=*/k, /*has_addend=*/has_addend) {} + }; + + ColumnMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs, llvm::Value* rhs, llvm::Value* addend, llvm::Value* result, llvm::IRBuilder<>* ir_builder) - : scalar_type_(scalar_type), - tile_rows_(tile_rows), - tile_cols_(tile_cols), - m_(m), - k_(k), + : config_(config), lhs_(lhs), rhs_(rhs), addend_(addend), result_(result), ir_builder_(ir_builder), ksl_(ir_builder_), - vsl_(scalar_type_, /*vector_size=*/tile_rows_, ir_builder_, "") { - CHECK(tile_rows_ > 0 && IsPowerOfTwo(static_cast(tile_rows_))); + vsl_(config.scalar_type(), /*vector_size=*/config.tile_rows(), + ir_builder_, "") { + CHECK(tile_rows() > 0 && IsPowerOfTwo(static_cast(tile_rows()))); + CHECK(!has_addend() || addend != nullptr); } void Emit(); + const Config& config() const { return config_; } + private: void EmitOuterLoopBody(llvm::Value* column, int64 column_count, bool is_first_column); - TileLoader GetLhsTileLoader(llvm::Value* column_start, int64 column_count) { - return TileLoader(&vsl_, ir_builder_, /*matrix=*/lhs_, - /*matrix_size_along_minor_dim=*/m_, + MemoryTile GetLhsMemoryTile(llvm::Value* column_start, int64 column_count) { + return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_, + /*matrix_size_along_minor_dim=*/m(), /*major_dim_offset=*/column_start, /*tile_size_along_major_dim=*/column_count); } @@ -187,18 +290,14 @@ class ColumnMajorMatrixVectorProductEmitter { return result; } - void EmitInnerLoopTiled(TileLoader* lhs_tile_loader, + void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, const std::vector& rhs_tile, int64 columns, bool is_first_column); void EmitInnerLoopEpilogue(llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column); - PrimitiveType scalar_type_; - int64 tile_rows_; - int64 tile_cols_; - int64 m_; - int64 k_; + Config config_; llvm::Value* lhs_; llvm::Value* rhs_; llvm::Value* addend_; @@ -210,25 +309,25 @@ class ColumnMajorMatrixVectorProductEmitter { void ColumnMajorMatrixVectorProductEmitter::EmitOuterLoopBody( llvm::Value* column, int64 column_count, bool is_first_column) { - TileLoader lhs_tile_loader = GetLhsTileLoader(/*column_start=*/column, + MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*column_start=*/column, /*column_count=*/column_count); std::vector rhs_tile = LoadRhsTile(column, /*count=*/column_count); - EmitInnerLoopTiled(&lhs_tile_loader, rhs_tile, + EmitInnerLoopTiled(&lhs_memory_tile, rhs_tile, /*columns=*/column_count, is_first_column); EmitInnerLoopEpilogue(column, /*columns=*/column_count, is_first_column); } void ColumnMajorMatrixVectorProductEmitter::Emit() { // See the comment on the class declaration for the algorithm used here. 
- int64 column_remainder = k_ % tile_cols_; - int64 column_limit = k_ - column_remainder; + int64 column_remainder = k() % tile_cols(); + int64 column_limit = k() - column_remainder; ksl_.For("dot.outer.tiled", - /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols_, + /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(), [&](llvm::Value* column, bool is_first_column) { - EmitOuterLoopBody(column, tile_cols_, is_first_column); + EmitOuterLoopBody(column, tile_cols(), is_first_column); }); if (column_remainder != 0) { @@ -238,14 +337,14 @@ void ColumnMajorMatrixVectorProductEmitter::Emit() { } void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled( - TileLoader* lhs_tile_loader, const std::vector& rhs_tile, + MemoryTile* lhs_memory_tile, const std::vector& rhs_tile, int64 columns, bool is_first_column) { - int64 row_limit = m_ - (m_ % tile_rows_); + int64 row_limit = m() - (m() % tile_rows()); ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit, - /*step=*/tile_rows_, [&](llvm::Value* row) { + /*step=*/tile_rows(), [&](llvm::Value* row) { std::vector lhs_tile = - lhs_tile_loader->LoadTile(/*minor_dim_offset=*/row); + lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row); llvm::Value* accumulator = is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row) : vsl_.GetZeroVector()) @@ -259,8 +358,8 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled( void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( llvm::Value* current_tile_col, int64 columns, bool is_first_tiled_column) { - int64 row_start = m_ - (m_ % tile_rows_); - if (row_start == m_) { + int64 row_start = m() - (m() % tile_rows()); + if (row_start == m()) { return; } @@ -280,11 +379,11 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( [&](llvm::Value* col, llvm::Value* is_first_scalar_col) { llvm::Value* rhs_element = vsl_.LoadScalar(rhs_, col); llvm::Value* total_offset = - ir_builder_->CreateMul(col, ir_builder_->getInt64(m_)); + ir_builder_->CreateMul(col, ir_builder_->getInt64(m())); llvm::Value* lhs_base_pointer = vsl_.ComputeOffsetPointer(lhs_, total_offset); ksl_.For( - "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m_, + "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(), /*step=*/1, [&](llvm::Value* scalar_row) { llvm::Value* product = vsl_.Mul( vsl_.LoadScalar(lhs_base_pointer, scalar_row), rhs_element); @@ -364,51 +463,55 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( // // We have an inner epilogue loop to deal with the "B" sub-matrix and an outer // epilogue loop to deal with the C,D submatrix. 
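For the row-major case, the scalar sketch below (again an illustrative reference with a hypothetical name, not the emitter's output) mirrors the split described above: a tiled loop over whole multiples of `tile_cols` and a scalar epilogue for the leftover columns, with one accumulator per row of the current row tile. In the emitter each per-row accumulator is a whole vector of width `tile_cols` that is horizontally reduced, and combined with the optional addend, after the loops.

```c++
#include <algorithm>
#include <cstdint>
#include <vector>

// result[r] = sum_c lhs[r, c] * rhs[c], with lhs shaped [M, K] and stored
// row major, i.e. lhs[r, c] == lhs_data[r * k + c].
void RowMajorGemvReference(const std::vector<float>& lhs_data,
                           const std::vector<float>& rhs, int64_t m, int64_t k,
                           int64_t tile_rows, int64_t tile_cols,
                           std::vector<float>* result) {
  const int64_t column_limit = k - (k % tile_cols);     // boundary between tiled loop and epilogue
  for (int64_t r0 = 0; r0 < m; r0 += tile_rows) {       // "dot.outer.tiled" (+ row remainder)
    const int64_t r_end = std::min(r0 + tile_rows, m);
    for (int64_t r = r0; r < r_end; ++r) {
      float vector_acc = 0.0f;                          // a vector accumulator in the emitter
      for (int64_t c = 0; c < column_limit; ++c) {      // "dot.inner.tiled"
        vector_acc += lhs_data[r * k + c] * rhs[c];
      }
      float scalar_acc = 0.0f;                          // "dot.inner.epilg.inner"
      for (int64_t c = column_limit; c < k; ++c) {
        scalar_acc += lhs_data[r * k + c] * rhs[c];
      }
      (*result)[r] = vector_acc + scalar_acc;           // plus the addend, when present
    }
  }
}
```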
-class RowMajorMatrixVectorProductEmitter { +class RowMajorMatrixVectorProductEmitter + : public GemvConfig::User { public: - RowMajorMatrixVectorProductEmitter(PrimitiveType scalar_type, int64 tile_rows, - int64 tile_cols, int64 m, int64 k, - llvm::Value* lhs, llvm::Value* rhs, - llvm::Value* addend, llvm::Value* result, + class Config : public GemvConfig { + public: + explicit Config(PrimitiveType scalar_type, int64 tile_rows, int64 tile_cols, + int64 m, int64 k, bool has_addend) + : GemvConfig(/*name=*/"row_major_gemv", scalar_type, + /*tile_rows=*/tile_rows, /*tile_cols=*/tile_cols, /*m=*/m, + /*k=*/k, /*has_addend=*/has_addend) {} + }; + + RowMajorMatrixVectorProductEmitter(const Config& config, llvm::Value* lhs, + llvm::Value* rhs, llvm::Value* addend, + llvm::Value* result, llvm::IRBuilder<>* ir_builder) - : scalar_type_(scalar_type), - tile_rows_(tile_rows), - tile_cols_(tile_cols), - m_(m), - k_(k), + : config_(config), lhs_(lhs), rhs_(rhs), addend_(addend), result_(result), ir_builder_(ir_builder), ksl_(ir_builder_), - vsl_(scalar_type_, /*vector_size=*/tile_cols_, ir_builder_, "") { - CHECK(tile_cols_ > 0 && IsPowerOfTwo(static_cast(tile_cols_))); + vsl_(scalar_type(), /*vector_size=*/tile_cols(), ir_builder_, "") { + CHECK(tile_cols() > 0 && IsPowerOfTwo(static_cast(tile_cols()))); + CHECK(!has_addend() || addend != nullptr); } void Emit(); + const Config& config() const { return config_; } + private: - TileLoader GetLhsTileLoader(llvm::Value* row_start, int64 row_count) { - return TileLoader(&vsl_, ir_builder_, /*matrix=*/lhs_, - /*matrix_size_along_minor_dim=*/k_, + MemoryTile GetLhsMemoryTile(llvm::Value* row_start, int64 row_count) { + return MemoryTile(&vsl_, ir_builder_, /*matrix=*/lhs_, + /*matrix_size_along_minor_dim=*/k(), /*major_dim_offset=*/row_start, /*tile_size_along_major_dim=*/row_count); } void EmitOuterLoopBody(llvm::Value* row, int64 row_count); - void EmitInnerLoopTiled(TileLoader* lhs_tile_loader, int64 rows, + void EmitInnerLoopTiled(MemoryTile* lhs_memory_tile, int64 rows, std::vector* vector_accumulators); void EmitInnerLoopEpilogue(llvm::Value* current_tile_row, int64 rows, std::vector* scalar_accumulators); - PrimitiveType scalar_type_; - int64 tile_rows_; - int64 tile_cols_; - int64 m_; - int64 k_; + Config config_; llvm::Value* lhs_; llvm::Value* rhs_; llvm::Value* addend_; @@ -420,7 +523,7 @@ class RowMajorMatrixVectorProductEmitter { void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row, int64 row_count) { - TileLoader lhs_tile_loader = GetLhsTileLoader(/*row_start=*/row, + MemoryTile lhs_memory_tile = GetLhsMemoryTile(/*row_start=*/row, /*row_count=*/row_count); std::vector vector_accumulators; std::vector scalar_accumulators; @@ -428,7 +531,7 @@ void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row, vector_accumulators.emplace_back(&vsl_, vsl_.GetZeroVector()); scalar_accumulators.emplace_back(&vsl_, vsl_.GetZeroScalar()); } - EmitInnerLoopTiled(&lhs_tile_loader, /*rows=*/row_count, + EmitInnerLoopTiled(&lhs_memory_tile, /*rows=*/row_count, &vector_accumulators); EmitInnerLoopEpilogue(/*current_tile_row=*/row, /*rows=*/row_count, &scalar_accumulators); @@ -465,12 +568,12 @@ void RowMajorMatrixVectorProductEmitter::EmitOuterLoopBody(llvm::Value* row, void RowMajorMatrixVectorProductEmitter::Emit() { // See the comment on the class declaration for the algorithm used here. 
- int64 row_remainder = m_ % tile_rows_; - int64 row_limit = m_ - row_remainder; + int64 row_remainder = m() % tile_rows(); + int64 row_limit = m() - row_remainder; ksl_.For("dot.outer.tiled", - /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows_, - [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows_); }); + /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(), + [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); }); if (row_remainder != 0) { EmitOuterLoopBody(ir_builder_->getInt64(row_limit), row_remainder); @@ -478,14 +581,14 @@ void RowMajorMatrixVectorProductEmitter::Emit() { } void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled( - TileLoader* lhs_tile_loader, int64 rows, + MemoryTile* lhs_memory_tile, int64 rows, std::vector* vector_accumulators) { - int64 column_limit = k_ - (k_ % tile_cols_); + int64 column_limit = k() - (k() % tile_cols()); ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit, - /*step=*/tile_cols_, [&](llvm::Value* col) { + /*step=*/tile_cols(), [&](llvm::Value* col) { std::vector lhs_tile = - lhs_tile_loader->LoadTile(/*minor_dim_offset=*/col); + lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col); llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col); for (int i = 0; i < rows; i++) { llvm::Value* old_sum = (*vector_accumulators)[i].Get(); @@ -498,18 +601,18 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled( void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( llvm::Value* current_tile_row, int64 rows, std::vector* scalar_accumulators) { - int64 column_start = k_ - (k_ % tile_cols_); - if (column_start == k_) { + int64 column_start = k() - (k() % tile_cols()); + if (column_start == k()) { return; } for (int r = 0; r < rows; r++) { llvm::Value* total_offset = ir_builder_->CreateMul( ir_builder_->CreateAdd(ir_builder_->getInt64(r), current_tile_row), - ir_builder_->getInt64(k_)); + ir_builder_->getInt64(k())); llvm::Value* lhs_base_pointer = vsl_.ComputeOffsetPointer(lhs_, total_offset); - ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k_, + ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(), /*step=*/1, [&](llvm::Value* scalar_col) { llvm::Value* product = vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col), @@ -520,18 +623,336 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue( } } +// This class implements a tiled matrix multiplication algorithm, intended for +// use as the innermost GEBP loop in a GEMM kernel (GEBP is described in "Goto, +// Kazushige, and Robert Van De Geijn. "High-performance implementation of the +// level-3 BLAS." ACM Transactions on Mathematical Software (TOMS) 35.1 (2008): +// 4). +// +// This only supports canonical dot operations (i.e. where the lhs contraction +// dimension is 1 and the rhs contraction dimension is 0) over row major +// matrices. +class MatrixMatrixBlockPanelEmitter { + public: + // Describe the dimensions of the GEBP kernel. These will usually not be the + // dimensions of the GEMM itself, the GEMM will usually be broken up into GEBP + // kernels with smaller dimensions. + class Dimensions { + public: + explicit Dimensions(int64 m, int64 k, int64 n) : m_(m), k_(k), n_(n) {} + + int64 m() const { return m_; } + int64 k() const { return k_; } + int64 n() const { return n_; } + + string ToString() const { + return tensorflow::strings::StrCat(m(), "x", k(), "x", n()); + } + + private: + const int64 m_; + const int64 k_; + const int64 n_; + }; + + // Represents the configuration of the GEBP emitter. 
The LLVM IR emitted by + // the emitter, modulo the LLVM values holding the input and output buffers, + // must be a function of the instance of `Config` passed to it. + // + // `dims` holds the matrix multiplication dimensions. + // + // `max_vectorization_width` is the maximum vector width (i.e. the width of + // the largest vector register we will use). This can be larger than the + // largest vector register supported by the machine -- LLVM will legalize + // these large vector widths into legally sized vectors. + // `min_vectorization_width` is the smallest vector width the emitter will use + // -- below that it will devolve to using a scalar loop. + // + // The innermost reduction loop executes the matrix multiply in tiles of size + // [`tile_size_m`, `tile_size_k`] from the LHS and [`tile_size_k`, + // ] in the RHS. + class Config { + public: + explicit Config(PrimitiveType scalar_type, Dimensions dims, + int64 max_vectorization_width, + int64 min_vectorization_width, int64 tile_size_m, + int64 tile_size_k) + : scalar_type_(scalar_type), + dims_(dims), + max_vectorization_width_(max_vectorization_width), + min_vectorization_width_(min_vectorization_width), + tile_size_m_(tile_size_m), + tile_size_k_(tile_size_k) {} + + string GetCacheKey() const { + return tensorflow::strings::StrCat( + "gebp_", PrimitiveType_Name(scalar_type()), "_", dims().ToString(), + "_", max_vectorization_width(), "_", min_vectorization_width(), "_", + tile_size_m(), "_", tile_size_k()); + } + + PrimitiveType scalar_type() const { return scalar_type_; } + Dimensions dims() const { return dims_; } + int64 max_vectorization_width() const { return max_vectorization_width_; } + int64 min_vectorization_width() const { return min_vectorization_width_; } + + int64 tile_size_m() const { return tile_size_m_; } + int64 tile_size_k() const { return tile_size_k_; } + + private: + PrimitiveType scalar_type_; + Dimensions dims_; + int64 max_vectorization_width_; + int64 min_vectorization_width_; + int64 tile_size_m_; + int64 tile_size_k_; + }; + + // Creates an instance of MatrixMatrixBlockPanelEmitter that matrix-multiplies + // `lhs` with `rhs` and stores the result in `result`. + explicit MatrixMatrixBlockPanelEmitter(Config config, llvm::Value* lhs, + llvm::Value* rhs, llvm::Value* result, + llvm::IRBuilder<>* ir_builder) + : lhs_(lhs), + rhs_(rhs), + result_(result), + config_(config), + ir_builder_(ir_builder), + ksl_(ir_builder_) { + CHECK(max_vectorization_width() > 0 && + IsPowerOfTwo(static_cast(max_vectorization_width()))); + CHECK(min_vectorization_width() > 0 && + IsPowerOfTwo(static_cast(min_vectorization_width()))); + CHECK_GT(tile_size_k(), 0); + } + + void Emit(); + + private: + // This emits a loop that loops over the `n` dimension in multiples of + // `max_vectorization_width` as much as possible and then emits a remainder + // epilogue. + void EmitLoopOverN(); + + // This emits a loop that loops over the `k` dimension in multiples of + // `tile_size_k` as much as possible and then emits a remainder epilogue. + void EmitLoopOverK(VectorSupportLibrary* vsl, llvm::Value* n_start, + llvm::Value* n_end); + + // This emits a loop that loops over the `m` dimension in multiples of + // `tile_size_m` as much as possible and then emits a remainder epilogue. + void EmitLoopOverM(VectorSupportLibrary* vsl, int64 tile_size_k, + llvm::Value* k_start, llvm::Value* k_end, + llvm::Value* n_start, llvm::Value* n_end); + + // This emits the inner reduction loop. 
This inner reduction loop multiplies + // a tile from the LHS of size [tile_size_m,tile_size_k] and a tile from the + // RHS of size [`tile_size_k`, vls->vector_width()] to update a tile of size + // [`tile_size_m`, vls->vector_width()] in the result. + void EmitTiledReductionLoop(VectorSupportLibrary* vsl, int64 tile_size_k, + llvm::Value* k_start, llvm::Value* k_end, + llvm::Value* n_start, llvm::Value* n_end, + int64 tile_size_m, llvm::Value* m_start, + llvm::Value* m_end); + + llvm::Value* GetInt64(int64 value) { return ir_builder_->getInt64(value); } + + Config config() const { return config_; } + Dimensions dims() const { return config().dims(); } + + int64 max_vectorization_width() const { + return config().max_vectorization_width(); + } + int64 min_vectorization_width() const { + return config().min_vectorization_width(); + } + int64 tile_size_m() const { return config().tile_size_m(); } + int64 tile_size_k() const { return config().tile_size_k(); } + PrimitiveType scalar_type() const { return config().scalar_type(); } + + llvm::Value* lhs_; + llvm::Value* rhs_; + llvm::Value* result_; + Config config_; + + llvm::IRBuilder<>* ir_builder_; + KernelSupportLibrary ksl_; +}; + +void MatrixMatrixBlockPanelEmitter::Emit() { EmitLoopOverN(); } + +void MatrixMatrixBlockPanelEmitter::EmitLoopOverN() { + // We can only iterate the `n` dimension for an extent that is divisible by + // the vectorization width. So we emit an outer loop that first processes the + // largest extent in `n` that is divisible by max_vectorization_width, then + // the largest remaining extent that is divisible by max_vectorization_width / + // 2 etc. + + int64 current_vectorization_width = max_vectorization_width(); + int64 n_start = 0; + while (n_start != dims().n() && + current_vectorization_width >= min_vectorization_width()) { + int64 n_end = dims().n() - (dims().n() % current_vectorization_width); + if (n_start != n_end) { + VectorSupportLibrary vsl(scalar_type(), current_vectorization_width, + ir_builder_, "gebp"); + EmitLoopOverK(&vsl, GetInt64(n_start), GetInt64(n_end)); + n_start = n_end; + } + current_vectorization_width /= 2; + } + + if (n_start != dims().n()) { + VectorSupportLibrary vsl(scalar_type(), 1, ir_builder_, "gebp"); + ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) { + llvm::Value* n_i_next = + ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1)); + EmitLoopOverK(&vsl, n_i, n_i_next); + }); + } +} + +void MatrixMatrixBlockPanelEmitter::EmitLoopOverK(VectorSupportLibrary* vsl, + llvm::Value* n_start, + llvm::Value* n_end) { + int64 k_start = 0; + int64 k_end = dims().k() - (dims().k() % tile_size_k()); + if (k_end != k_start) { + EmitLoopOverM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end), + n_start, n_end); + k_start = k_end; + } + + if (k_start != dims().k()) { + EmitLoopOverM(vsl, dims().k() - k_start, GetInt64(k_start), + GetInt64(dims().k()), n_start, n_end); + } +} + +void MatrixMatrixBlockPanelEmitter::EmitLoopOverM( + VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start, + llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) { + const int64 m_end = dims().m() - dims().m() % tile_size_m(); + EmitTiledReductionLoop(vsl, tile_size_k, k_start, k_end, n_start, n_end, + tile_size_m(), GetInt64(0), GetInt64(m_end)); + + if (m_end != dims().m()) { + EmitTiledReductionLoop(vsl, tile_size_k, k_start, k_end, n_start, n_end, + dims().m() - m_end, GetInt64(m_end), + GetInt64(dims().m())); + } +} + +// The tiling scheme is as follows: +// +// Let the 
LHS be: +// +// +----+----+----+ +// | a0 | b0 | c0 | . +// +----+----+----+ . +// | a1 | b1 | c1 | . +// +----+----+----+ +// .. .. +// +// and the RHS be: +// +// +----+----+----+----+ +// | p0 | p1 | p2 | p3 | . +// +----+----+----+----+ . +// | q0 | q1 | q2 | q3 | . +// +----+----+----+----+ +// | r0 | r1 | r2 | r3 | . +// +----+----+----+----+ . +// ...... ...... +// +// and let tile_size_m=2, tile_size_k=3 and the vector width (implicitly denoted +// by `vsl`) be 4. Then we want to matrix multiply this tile to get a [2,4] +// matrix that we can increment the result matrix by. +// +// First broadcast the rows row in LHS to 3 vectors of width 4, giving us a rank +// 3 array, L, of dimension [2,3,4]: +// +// L[0,_,_] * L[1,_,_] +// * +// +----+----+----+----+ * +----+----+----+----+ +// | a0 | a0 | a0 | a0 | * | a1 | a1 | a1 | a1 | +// +----+----+----+----+ * +----+----+----+----+ +// | b0 | b0 | b0 | b0 | * | b1 | b1 | b1 | b1 | +// +----+----+----+----+ * +----+----+----+----+ +// | c0 | c0 | c0 | c0 | * | c1 | c1 | c1 | c1 | +// +----+----+----+----+ * +----+----+----+----+ +// +// +// Then we FMA L[0,_,_] with the RHS to get the first row of the result and +// L[1,_,_] with the RHS to get the second row of the result. For example, +// L[0,_,_] is computed as: +// +// +----+----+----+----+ +----+----+----+----+ +// | a0 | a0 | a0 | a0 | * | p0 | p1 | p2 | p3 | + +// +----+----+----+----+ +----+----+----+----+ +// +// +----+----+----+----+ +----+----+----+----+ +// | b0 | b0 | b0 | b0 | * | q0 | q1 | q2 | q3 | + +// +----+----+----+----+ +----+----+----+----+ +// +// +----+----+----+----+ +----+----+----+----+ +// | c0 | c0 | c0 | c0 | * | r0 | r1 | r2 | r3 | +// +----+----+----+----+ +----+----+----+----+ +// +// to get: +// +// +-------------------+-------------------+-------------------+--------- +// | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 | ... 
+// +-------------------+-------------------+-------------------+--------- +void MatrixMatrixBlockPanelEmitter::EmitTiledReductionLoop( + VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start, + llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end, + int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) { + ksl_.For("dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) { + MemoryTile result_memory_tile(vsl, ir_builder_, /*matrix=*/result_, + /*matrix_size_along_minor_dim=*/dims().n(), + /*major_dim_offset=*/m_i, + /*tile_size_along_major_dim=*/tile_size_m); + MemoryTile lhs_memory_tile(vsl, ir_builder_, /*matrix=*/lhs_, + /*matrix_size_along_minor_dim=*/dims().k(), + /*major_dim_offset=*/m_i, + /*tile_size_along_major_dim=*/tile_size_m); + + ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) { + MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_, dims().n(), k_i, + tile_size_k); + std::vector> lhs_tile = + lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k); + ksl_.For( + "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) { + std::vector rhs_tile = rhs_memory_tile.LoadTile(n_i); + std::vector result_tile = + result_memory_tile.LoadTile(n_i); + for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) { + for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) { + result_tile[r_m_i] = + vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i], + result_tile[r_m_i]); + } + } + result_memory_tile.StoreTile(result_tile, n_i); + }); + }); + }); +} + } // namespace -DotOpEmitter::DotOpEmitter( - const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs, - const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array, - const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, - llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder, - const HloModuleConfig& hlo_module_config, - const TargetMachineFeatures& target_machine_features) +DotOpEmitter::DotOpEmitter(const HloInstruction& dot, + const llvm_ir::IrArray& target_array, + const llvm_ir::IrArray& lhs_array, + const llvm_ir::IrArray& rhs_array, + const llvm_ir::IrArray* addend_array, + llvm::Value* executable_run_options_value, + llvm::IRBuilder<>* ir_builder, + const HloModuleConfig& hlo_module_config, + const TargetMachineFeatures& target_machine_features) : dot_(dot), - transpose_lhs_(transpose_lhs), - transpose_rhs_(transpose_rhs), target_array_(target_array), lhs_array_(lhs_array), rhs_array_(rhs_array), @@ -541,23 +962,99 @@ DotOpEmitter::DotOpEmitter( hlo_module_config_(hlo_module_config), target_machine_features_(target_machine_features) {} -/* static */ tensorflow::Status DotOpEmitter::EmitDotOperation( - const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs, - const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array, - const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, +/* static */ Status DotOpEmitter::EmitDotOperation( + const HloInstruction& dot, const llvm_ir::IrArray& target_array, + const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, + const llvm_ir::IrArray* addend_array, llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features) { PrimitiveType type = target_array.GetShape().element_type(); TF_RET_CHECK(F16 == type || F32 == type || F64 == type || C64 == type); - DotOpEmitter dot_emitter(dot, transpose_lhs, transpose_rhs, target_array, - lhs_array, 
rhs_array, addend_array, - executable_run_options_value, ir_builder, - hlo_module_config, target_machine_features); + DotOpEmitter dot_emitter(dot, target_array, lhs_array, rhs_array, + addend_array, executable_run_options_value, + ir_builder, hlo_module_config, + target_machine_features); return dot_emitter.Emit(); } -bool DotOpEmitter::ShapesAreLegalForRuntimeDot() const { return true; } +bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled( + const DotOpEmitter::MatMultDims& mat_mult_dims) { + if (!EnableExperimentalLlvmIrGemm() || ShouldUseMultiThreadedEigen()) { + return false; + } + + if (mat_mult_dims.lhs_non_canonical || mat_mult_dims.rhs_non_canonical) { + return false; + } + + PrimitiveType primitive_type = dot_.shape().element_type(); + + switch (primitive_type) { + default: + return false; + + case F32: + case F64: + case S32: + case S64: + break; + } + + if (!(mat_mult_dims.lhs_column_major == mat_mult_dims.rhs_column_major && + mat_mult_dims.rhs_column_major == mat_mult_dims.target_column_major)) { + return false; + } + + llvm::Value* lhs = lhs_array_.GetBasePointer(); + llvm::Value* rhs = rhs_array_.GetBasePointer(); + llvm::Value* target = target_array_.GetBasePointer(); + int64 m = mat_mult_dims.m; + int64 k = mat_mult_dims.k; + int64 n = mat_mult_dims.n; + + if (mat_mult_dims.lhs_column_major) { + std::swap(lhs, rhs); + std::swap(m, n); + } + + int64 size_bytes = m * n * ShapeUtil::ByteSizeOfPrimitiveType(primitive_type); + ir_builder_->CreateMemSet( + target, ir_builder_->getInt8(0), size_bytes, + target_machine_features_.minimum_alignment_for_allocation(size_bytes)); + + int64 max_vector_width = + target_machine_features_.vector_register_num_elements( + *ir_builder_->GetInsertBlock()->getParent(), primitive_type); + + MatrixMatrixBlockPanelEmitter::Config config( + /*scalar_type=*/primitive_type, + MatrixMatrixBlockPanelEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n}, + /*max_vectorization_width=*/max_vector_width, + /*min_vectorization_width=*/std::min(4, max_vector_width), + /*tile_size_m=*/3, /*tile_size_k=*/5); + + VLOG(2) << "Emitting GEBP kernel in LLVM IR with config " + << config.GetCacheKey(); + + const bool enable_fast_math = + hlo_module_config_.debug_options().xla_enable_fast_math(); + const bool optimize_for_size = + options::OptimizeForSizeRequested(hlo_module_config_); + + KernelSupportLibrary::EmitAndCallOutlinedKernel( + /*enable_fast_math=*/enable_fast_math, + /*optimize_for_size=*/optimize_for_size, ir_builder_, + config.GetCacheKey(), lhs, rhs, target, + [this, config](llvm::Value* lhs, llvm::Value* rhs, llvm::Value* target) { + MatrixMatrixBlockPanelEmitter gebp_emitter( + config, /*lhs=*/lhs, /*rhs=*/rhs, + /*result=*/target, ir_builder_); + gebp_emitter.Emit(); + }); + + return true; +} bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { if (dot_.shape().dimensions_size() != 2) { @@ -580,7 +1077,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { if (mat_mult_dims.m == 1) { bool rhs_effectively_row_major = - transpose_rhs_ ^ !mat_mult_dims.rhs_column_major; + mat_mult_dims.rhs_non_canonical ^ !mat_mult_dims.rhs_column_major; if (rhs_effectively_row_major) { k = mat_mult_dims.k; m = mat_mult_dims.n; @@ -596,7 +1093,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { if (mat_mult_dims.n == 1) { bool lhs_effectively_column_major = - transpose_lhs_ ^ mat_mult_dims.lhs_column_major; + mat_mult_dims.lhs_non_canonical ^ mat_mult_dims.lhs_column_major; if (lhs_effectively_column_major) { m = mat_mult_dims.m; k = mat_mult_dims.k; @@ -611,7 +1108,7 @@ bool 
DotOpEmitter::EmitLlvmIrDotIfProfitable() { } if (!is_column_major_matrix_vector && !is_row_major_matrix_vector) { - return false; + return EmitExperimentalGebpDotIfEnabled(mat_mult_dims); } int64 tiling_factor = GetGemvTilingFactor(); @@ -644,47 +1141,39 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { if (is_column_major_matrix_vector) { VLOG(2) << "Emitting column major matrix-vector multiply with m = " << m << " and k = " << k; - int64 tile_rows = vector_register_element_size; - int64 tile_cols = tiling_factor; - - string kernel_name = tensorflow::strings::StrCat( - "col_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows, - "_", tile_cols, "_", m, "_", k, addend_array_ ? "_with_addend" : ""); + ColumnMajorMatrixVectorProductEmitter::Config config( + /*scalar_type=*/primitive_type, + /*tile_rows=*/vector_register_element_size, /*tile_cols=*/tiling_factor, + /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr); KernelSupportLibrary::EmitAndCallOutlinedKernel( /*enable_fast_math=*/enable_fast_math, - /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name, - lhs_op, rhs_op, + /*optimize_for_size=*/optimize_for_size, ir_builder_, + config.GetCacheKey(), lhs_op, rhs_op, addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op, - [this, tile_rows, tile_cols, m, k, primitive_type]( - llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op, - llvm::Value* result_op) { + [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op, + llvm::Value* addend_op, llvm::Value* result_op) { ColumnMajorMatrixVectorProductEmitter emitter( - primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op, - addend_op, result_op, ir_builder_); + config, lhs_op, rhs_op, addend_op, result_op, ir_builder_); emitter.Emit(); }); } else { VLOG(2) << "Emitting row major matrix-vector multiply with m = " << m << " and k = " << k; - int64 tile_rows = tiling_factor; - int64 tile_cols = vector_register_element_size; - - string kernel_name = tensorflow::strings::StrCat( - "row_major_gemv_", PrimitiveType_Name(primitive_type), "_", tile_rows, - "_", tile_cols, "_", m, "_", k, addend_array_ ? "_with_addend" : ""); + RowMajorMatrixVectorProductEmitter::Config config( + /*scalar_type=*/primitive_type, + /*tile_rows=*/tiling_factor, /*tile_cols=*/vector_register_element_size, + /*m=*/m, /*k=*/k, /*has_addend=*/addend_array_ != nullptr); KernelSupportLibrary::EmitAndCallOutlinedKernel( /*enable_fast_math=*/enable_fast_math, - /*optimize_for_size=*/optimize_for_size, ir_builder_, kernel_name, - lhs_op, rhs_op, + /*optimize_for_size=*/optimize_for_size, ir_builder_, + config.GetCacheKey(), lhs_op, rhs_op, addend_array_ ? addend_array_->GetBasePointer() : nullptr, result_op, - [this, tile_rows, tile_cols, m, k, primitive_type]( - llvm::Value* lhs_op, llvm::Value* rhs_op, llvm::Value* addend_op, - llvm::Value* result_op) { + [this, config](llvm::Value* lhs_op, llvm::Value* rhs_op, + llvm::Value* addend_op, llvm::Value* result_op) { RowMajorMatrixVectorProductEmitter emitter( - primitive_type, tile_rows, tile_cols, m, k, lhs_op, rhs_op, - addend_op, result_op, ir_builder_); + config, lhs_op, rhs_op, addend_op, result_op, ir_builder_); emitter.Emit(); }); } @@ -692,7 +1181,7 @@ bool DotOpEmitter::EmitLlvmIrDotIfProfitable() { return true; } -tensorflow::Status DotOpEmitter::Emit() { +Status DotOpEmitter::Emit() { // The dot operation performs a sum of products over dimension 0 of the left // hand side operand and dimension 1 of the right hand side operand. 
// @@ -736,23 +1225,17 @@ tensorflow::Status DotOpEmitter::Emit() { CHECK_EQ(addend_array_, nullptr); - if (PotentiallyImplementedAsEigenDot(dot_)) { + if (PotentiallyImplementedAsEigenDot(dot_, target_machine_features_)) { return EmitCallToRuntime(); } // Reduce along dimension 0 of the LHS and 1 of the RHS. Vectors are a special // case where the reduction dimension is 0 for both LHS and RHS. This results // in a vector dot product producing a scalar. - int64 lhs_reduction_dimension = 0; - if (ShapeUtil::Rank(lhs_shape) >= 2) { - lhs_reduction_dimension = - ShapeUtil::GetDimensionNumber(lhs_shape, transpose_lhs_ ? -2 : -1); - } - int64 rhs_reduction_dimension = 0; - if (ShapeUtil::Rank(rhs_shape) >= 2) { - rhs_reduction_dimension = - ShapeUtil::GetDimensionNumber(rhs_shape, transpose_rhs_ ? -1 : -2); - } + int64 lhs_reduction_dimension = + dot_.dot_dimension_numbers().lhs_contracting_dimensions(0); + int64 rhs_reduction_dimension = + dot_.dot_dimension_numbers().rhs_contracting_dimensions(0); // Verify the reduction dimension in the two operands are the same size. TF_RET_CHECK(lhs_shape.dimensions(lhs_reduction_dimension) == @@ -876,10 +1359,10 @@ tensorflow::Status DotOpEmitter::Emit() { // loop. ir_builder_->SetInsertPoint(loop_nest.GetOuterLoopExitBasicBlock()); - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status DotOpEmitter::EmitScalarDot() { +Status DotOpEmitter::EmitScalarDot() { // A scalar dot is just a scalar multiply. llvm::Value* result; llvm::Value* lhs_value = @@ -904,12 +1387,10 @@ tensorflow::Status DotOpEmitter::EmitScalarDot() { result = ir_builder_->CreateFMul(lhs_value, rhs_value); } target_array_.EmitWriteArrayElement(/*index=*/{}, result, ir_builder_); - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status DotOpEmitter::EmitCallToRuntime() { - DCHECK(ShapesAreLegalForRuntimeDot()); - +Status DotOpEmitter::EmitCallToRuntime() { // The signature of the Eigen runtime matmul function is: // // (void)(void* run_options, float* out, float* lhs, float* rhs, @@ -918,8 +1399,7 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() { // The two transpose_... parameters are actually booleans, but we use int32 // to avoid target-dependent calling convention details. 
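For orientation, the sketch below spells out plausible reference semantics for a call with this shape: `out = lhs * rhs` with all three buffers column major, `out` of size m x n, `lhs` of size m x k, `rhs` of size k x n, and each transpose flag indicating that the corresponding buffer actually holds the transposed matrix. This is an assumption-labeled illustration, not the actual `__xla_cpu_runtime_*` implementation; the column-major convention is inferred from the operand/dimension swapping in the surrounding code, and the function name is invented for this sketch.

```c++
#include <cstdint>

// Hypothetical reference semantics for a matmul entry point shaped like the
// runtime call emitted here. Not the real XLA CPU runtime function.
void ReferenceMatMulF32(const void* /*run_options*/, float* out,
                        const float* lhs, const float* rhs, int64_t m,
                        int64_t n, int64_t k, int32_t transpose_lhs,
                        int32_t transpose_rhs) {
  // Column major: element (row, col) of an R x C matrix lives at col * R + row.
  auto lhs_at = [&](int64_t row, int64_t col) {
    return transpose_lhs ? lhs[row * k + col] : lhs[col * m + row];
  };
  auto rhs_at = [&](int64_t row, int64_t col) {
    return transpose_rhs ? rhs[row * n + col] : rhs[col * k + row];
  };
  for (int64_t j = 0; j < n; ++j) {
    for (int64_t i = 0; i < m; ++i) {
      float acc = 0.0f;
      for (int64_t p = 0; p < k; ++p) {
        acc += lhs_at(i, p) * rhs_at(p, j);
      }
      out[j * m + i] = acc;  // out is column major as well
    }
  }
}
```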
- bool multi_threaded = - hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + bool multi_threaded = ShouldUseMultiThreadedEigen(); bool use_mkl_dnn = hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn(); PrimitiveType type = target_array_.GetShape().element_type(); llvm::Type* float_type; @@ -990,8 +1470,8 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() { const llvm_ir::IrArray* lhs = &lhs_array_; const llvm_ir::IrArray* rhs = &rhs_array_; - bool transpose_lhs = transpose_lhs_; - bool transpose_rhs = transpose_rhs_; + bool transpose_lhs = mat_mult_dims.lhs_non_canonical; + bool transpose_rhs = mat_mult_dims.rhs_non_canonical; if (!mat_mult_dims.lhs_column_major) { std::swap(mat_mult_dims.m, mat_mult_dims.n); @@ -1011,7 +1491,7 @@ tensorflow::Status DotOpEmitter::EmitCallToRuntime() { ir_builder_->getInt64(mat_mult_dims.k), ir_builder_->getInt32(transpose_lhs), ir_builder_->getInt32(transpose_rhs)}); - return tensorflow::Status::OK(); + return Status::OK(); } DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const { @@ -1019,12 +1499,18 @@ DotOpEmitter::MatMultDims DotOpEmitter::GetMatMultDims() const { const Shape& lhs_shape = lhs_array_.GetShape(); const Shape& rhs_shape = rhs_array_.GetShape(); - - return {lhs_shape.dimensions(transpose_lhs_ ? 1 : 0), - lhs_shape.dimensions(transpose_lhs_ ? 0 : 1), - rhs_shape.dimensions(transpose_rhs_ ? 0 : 1), - LayoutUtil::Minor(lhs_shape.layout(), 0) == 0, - LayoutUtil::Minor(rhs_shape.layout(), 0) == 0}; + const DotDimensionNumbers& dim_nums = dot_.dot_dimension_numbers(); + + return { + /*m=*/lhs_shape.dimensions(1 - dim_nums.lhs_contracting_dimensions(0)), + /*k=*/lhs_shape.dimensions(dim_nums.lhs_contracting_dimensions(0)), + /*n=*/rhs_shape.dimensions(1 - dim_nums.rhs_contracting_dimensions(0)), + /*lhs_column_major=*/LayoutUtil::Minor(lhs_shape.layout(), 0) == 0, + /*lhs_non_canonical=*/dim_nums.lhs_contracting_dimensions(0) == 0, + /*rhs_column_major=*/LayoutUtil::Minor(rhs_shape.layout(), 0) == 0, + /*rhs_non_canonical=*/dim_nums.rhs_contracting_dimensions(0) == 1, + /*target_column_major=*/ + LayoutUtil::Minor(target_array_.GetShape().layout(), 0) == 0}; } llvm_ir::IrArray::Index DotOpEmitter::EmitOperandArrayLoopNest( @@ -1064,19 +1550,39 @@ static bool IsRank2WithNoPadding(const Shape& shape) { // In a gemm operation where output = lhs * rhs, check whether the given shapes // are valid for the operation. -static bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, - const Shape& output_shape) { +static bool AreValidGemmShapes( + const Shape& lhs_shape, const Shape& rhs_shape, const Shape& output_shape, + const TargetMachineFeatures& target_machine_features) { // The inputs and the output must // 1) be matrices with no padding, and // 2) have an allowed element type. 
PrimitiveType output_primitive_type = output_shape.element_type(); - return (output_primitive_type == F64 || output_primitive_type == F32 || - output_primitive_type == F16) && - IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) && - IsRank2WithNoPadding(output_shape); + if (!(output_primitive_type == F64 || output_primitive_type == F32 || + output_primitive_type == F16)) { + return false; + } + + if (!(IsRank2WithNoPadding(lhs_shape) && IsRank2WithNoPadding(rhs_shape) && + IsRank2WithNoPadding(output_shape))) { + return false; + } + + auto is_aligned = [&](const Shape& shape) { + return GetMinimumAlignmentForArray(shape, target_machine_features) >= + TargetMachineFeatures::kEigenExpectedTensorAlignment; + }; + + if (!is_aligned(lhs_shape) || !is_aligned(rhs_shape) || + !is_aligned(output_shape)) { + return false; + } + + return true; } -bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) { +bool PotentiallyImplementedAsEigenDot( + const HloInstruction& hlo, + const TargetMachineFeatures& target_machine_features) { // For certain types of Dot, we can call Eigen if (hlo.opcode() == HloOpcode::kDot) { const Shape& lhs_shape = hlo.operand(0)->shape(); @@ -1093,28 +1599,18 @@ bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo) { // If gemm can accept the operand shapes, use it rather than a custom // kernel. - if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape())) { + if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape(), + target_machine_features)) { + const DotDimensionNumbers& dim_numbers = hlo.dot_dimension_numbers(); // The size of the reduction dimension should match. The shape inference // guarantees this invariant, so the check here is for programming // errors. - CHECK_EQ(lhs_shape.dimensions(1), rhs_shape.dimensions(0)); + CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)), + rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0))); return true; } } - if (hlo.opcode() == HloOpcode::kFusion && - hlo.fusion_kind() == HloInstruction::FusionKind::kTransposeDot && - hlo.fused_expression_root()->opcode() == HloOpcode::kDot) { - auto* dot = hlo.fused_expression_root(); - const Shape& lhs_shape = dot->operand(0)->shape(); - const Shape& rhs_shape = dot->operand(1)->shape(); - if (ShapeUtil::HasZeroElements(lhs_shape) || - ShapeUtil::HasZeroElements(rhs_shape)) { - return false; - } - return true; - } - return false; } diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h index 9d748eb81f7850..d88ccea0dbc845 100644 --- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h @@ -31,7 +31,9 @@ limitations under the License. namespace xla { namespace cpu { -bool PotentiallyImplementedAsEigenDot(const HloInstruction& hlo); +bool PotentiallyImplementedAsEigenDot( + const HloInstruction& hlo, + const TargetMachineFeatures& target_machine_features); // Returns the index for an operand to `hlo` that should ideally be column // major. Returns nullopt if there is no such operand or if `hlo` is not a dot @@ -55,17 +57,16 @@ class DotOpEmitter { // dimensions as the result, and the result is computed as `addend_array` + // dot(`lhs_array`, `rhs_array`). A non-null `addend_array` is only supported // for Matrix-vector products. 
- static tensorflow::Status EmitDotOperation( - const HloInstruction& dot, bool transpose_lhs, bool transpose_rhs, - const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array, - const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, + static Status EmitDotOperation( + const HloInstruction& dot, const llvm_ir::IrArray& target_array, + const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, + const llvm_ir::IrArray* addend_array, llvm::Value* executable_run_options_value, llvm::IRBuilder<>* ir_builder, const HloModuleConfig& hlo_module_config, const TargetMachineFeatures& target_machine_features); private: - DotOpEmitter(const HloInstruction& dot, bool transpose_lhs, - bool transpose_rhs, const llvm_ir::IrArray& target_array, + DotOpEmitter(const HloInstruction& dot, const llvm_ir::IrArray& target_array, const llvm_ir::IrArray& lhs_array, const llvm_ir::IrArray& rhs_array, const llvm_ir::IrArray* addend_array, @@ -75,18 +76,18 @@ class DotOpEmitter { const TargetMachineFeatures& target_machine_features); // Emits the IR to perform the dot operation. - tensorflow::Status Emit(); + Status Emit(); // Emits instructions to perform a scalar dot product (a multiply of the // LHS and RHS) and store the results in the target. - tensorflow::Status EmitScalarDot(); + Status EmitScalarDot(); // Emit an LLVM IR implementation of the dot operation if we can. Returns // true if an LLVM IR implementation was emitted. bool EmitLlvmIrDotIfProfitable(); // Emits a call to the CPU runtime to perform the matrix multiply. - tensorflow::Status EmitCallToRuntime(); + Status EmitCallToRuntime(); // Emits a series of nested loops for iterating over an operand array in the // dot operation. Loops are constructed in major to minor dimension layout @@ -99,10 +100,6 @@ class DotOpEmitter { llvm_ir::ForLoopNest* loop_nest, const llvm_ir::IrArray& operand_array, int64 reduction_dimension, tensorflow::StringPiece name_suffix); - // Our runtime operation requires that all arrays have the same layout, - // no padding, and a rank of two. - bool ShapesAreLegalForRuntimeDot() const; - // Represents the dimensions of a matrix-matrix multiply operation. struct MatMultDims { // The number of rows in the LHS. @@ -115,11 +112,20 @@ class DotOpEmitter { // The number of columns on the RHS. int64 n; - // True if the LHS matrix column major. + // True if the LHS matrix is column major. bool lhs_column_major; - // True if the RHS matrix column major. + // True if the LHS contraction dimension is not 1. + bool lhs_non_canonical; + + // True if the RHS matrix is column major. bool rhs_column_major; + + // True if the RHS contraction dimension is not 0. + bool rhs_non_canonical; + + // True if the result matrix is column major. + bool target_column_major; }; // Get the MatMultDims instance for the dot product this DotOpEmitter @@ -127,6 +133,8 @@ class DotOpEmitter { // of rank 2 as well). MatMultDims GetMatMultDims() const; + bool EmitExperimentalGebpDotIfEnabled(const MatMultDims& mat_mult_dims); + // When doing a tiled GEMV in LLVM IR, a "tile" consists of this many vector // registers. int64 GetGemvTilingFactor() const { @@ -135,9 +143,18 @@ class DotOpEmitter { .value_or(kDefaultTilingFactor); } + // Returns true if we should use an experimental implementation of GEMM + // (general matrix matrix multiplication) if possible. 
+ bool EnableExperimentalLlvmIrGemm() const { + return options::EnableExperimentalLlvmIrGemm(hlo_module_config_); + } + + // Returns true if we should call into multi-threaded Eigen routines. + bool ShouldUseMultiThreadedEigen() { + return hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + } + const HloInstruction& dot_; - const bool transpose_lhs_; - const bool transpose_rhs_; const llvm_ir::IrArray& target_array_; const llvm_ir::IrArray& lhs_array_; const llvm_ir::IrArray& rhs_array_; diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc b/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc index 7dcc4ca7fa08b4..c5628655915875 100644 --- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc +++ b/tensorflow/compiler/xla/service/cpu/external_constant_pool.cc @@ -26,13 +26,13 @@ limitations under the License. namespace xla { namespace cpu { -void ExternalConstantPool::Insert(string name, const Literal& literal, +void ExternalConstantPool::Insert(string name, const LiteralSlice& literal, int64 alignment) { CHECK(!ShapeUtil::IsTuple(literal.shape())); CHECK(alignment > 0 && IsPowerOfTwo(static_cast(alignment))); CHECK(entries_.find(name) == entries_.end()); - int64 literal_size = ShapeUtil::ByteSizeOf(literal.shape()); + const int64 literal_size = ShapeUtil::ByteSizeOf(literal.shape()); void* raw_pointer = tensorflow::port::AlignedMalloc( literal_size, std::max(alignment, sizeof(void*))); CHECK(raw_pointer != nullptr) << "failed to allocate " << literal_size diff --git a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h index 8008a56df4dbf1..0677f5f0b58005 100644 --- a/tensorflow/compiler/xla/service/cpu/external_constant_pool.h +++ b/tensorflow/compiler/xla/service/cpu/external_constant_pool.h @@ -43,7 +43,7 @@ class ExternalConstantPool { // The constant pool copies out the contents of `literal` into a buffer it // owns -- it does not keep pointers to `literal`, or to memory owned by // `literal`. - void Insert(string name, const Literal& literal, int64 alignment); + void Insert(string name, const LiteralSlice& literal, int64 alignment); // Find the constant with name `name` in this constant pool. If there isn't // such constant, return nullptr. diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc index f209a69e3cd0f8..b560b7531c0d24 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.cc @@ -24,8 +24,25 @@ limitations under the License. namespace xla { namespace cpu { +int64 GetMinimumAlignmentForArray( + const Shape& shape, const TargetMachineFeatures& target_machine_features) { + CHECK(ShapeUtil::IsArray(shape)); + CHECK(!LayoutUtil::HasLayout(shape) || LayoutUtil::IsDense(shape.layout())); + + // We don't require a layout to be set on `shape`. This only works on CPU + // because we don't pad our tensors or otherwise have complicated data tiling + // schemes. 
+ + int64 allocation_size_bytes = + ShapeUtil::ElementsIn(shape) * + ShapeUtil::ByteSizeOfPrimitiveType(shape.element_type()); + return target_machine_features.minimum_alignment_for_allocation( + allocation_size_bytes); +} + bool PotentiallyImplementedAsEigenConvolution( - const HloInstruction& convolution) { + const HloInstruction& convolution, + const TargetMachineFeatures& target_machine_features) { // The following conditions are necessary (but not sufficient) for // implementing `convolution` with Eigen convolution: // - the input and kernel have a non-zero number of elements. @@ -35,6 +52,18 @@ bool PotentiallyImplementedAsEigenConvolution( // To be sufficient, certain layout constraints need to be satisfied as well. const Shape& input_shape = convolution.operand(0)->shape(); const Shape& kernel_shape = convolution.operand(1)->shape(); + const Shape& output_shape = convolution.shape(); + + auto is_aligned = [&](const Shape& shape) { + return GetMinimumAlignmentForArray(shape, target_machine_features) >= + TargetMachineFeatures::kEigenExpectedTensorAlignment; + }; + + if (!is_aligned(input_shape) || !is_aligned(kernel_shape) || + !is_aligned(output_shape)) { + return false; + } + if (ShapeUtil::HasZeroElements(input_shape) || ShapeUtil::HasZeroElements(kernel_shape)) { return false; @@ -71,7 +100,6 @@ bool PotentiallyImplementedAsEigenConvolution( } } - const Shape& output_shape = convolution.shape(); return dnums.input_batch_dimension() == 0 && dnums.input_feature_dimension() == input_shape.dimensions_size() - 1 && dnums.output_batch_dimension() == 0 && diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h index 34b2003916933f..68fbc7caaa9bfe 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils.h @@ -17,13 +17,20 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_IR_EMISSION_UTILS_H_ #include "llvm/IR/Value.h" +#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" namespace xla { namespace cpu { bool PotentiallyImplementedAsEigenConvolution( - const HloInstruction& convolution); + const HloInstruction& convolution, + const TargetMachineFeatures& target_machine_features); + +// Computes the minimum alignment guaranteed for a tensor of shape `shape` on +// the target machine. +int64 GetMinimumAlignmentForArray( + const Shape& shape, const TargetMachineFeatures& target_machine_features); // Dynamic loop bounds are specified as an array of dimension index // [start, limit) pairs of ir values (one for each partitioned outer dimension). diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc index 215f48c4cc1a1a..530ebce854fedf 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc @@ -15,8 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { @@ -34,12 +35,17 @@ ENTRY Conv { } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); HloComputation* entry_computation = module->entry_computation(); HloInstruction* conv_instr = entry_computation->root_instruction(); - EXPECT_FALSE(cpu::PotentiallyImplementedAsEigenConvolution(*conv_instr)); + cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features( + [](int64 shape_size) { + return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; + }); + EXPECT_FALSE(cpu::PotentiallyImplementedAsEigenConvolution( + *conv_instr, target_machine_features)); } } // namespace diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc index 0b08ad8da3cf17..59223fddac2f5f 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc @@ -83,7 +83,7 @@ IrEmitter::IrEmitter( llvm::Module* llvm_module, std::unordered_map instruction_to_profile_idx, std::unordered_map computation_to_profile_idx, - llvm::TargetMachine* target_machine, + const TargetMachineFeatures* target_machine_features, ExternalConstantPool* external_constant_pool) : assignment_(assignment), module_(llvm_module), @@ -94,7 +94,7 @@ IrEmitter::IrEmitter( alias_analysis_(hlo_module, assignment, &llvm_module->getContext()), hlo_module_config_(hlo_module.config()), is_top_level_computation_(false), - target_machine_features_(target_machine), + target_machine_features_(*target_machine_features), external_constant_pool_(external_constant_pool) { ir_builder_.setFastMathFlags(llvm_ir::GetFastMathFlags( /*fast_math_enabled=*/hlo_module_config_.debug_options() @@ -160,41 +160,59 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) { return Status::OK(); } -Status IrEmitter::HandleConstant(HloInstruction* constant) { - VLOG(2) << "HandleConstant: " << constant->ToString(); - const Literal& literal = constant->literal(); - llvm::GlobalVariable* global_for_const; +llvm::Constant* IrEmitter::EmitGlobalForLiteral(const Literal& literal) { + llvm::Constant* result; // We avoid creating large constants in the LLVM IR since LLVM is not // efficient for large constant arrays. We still emit "small enough" constant // arrays into the Ir, in the off chance the LLVM optimizer can do something // interesting with it. + // + // TODO(b/29904935): Remove the large constant pool. 
const int kMaxInternalConstantSizeInBytes = 128; if (external_constant_pool_ && ByteSizeOf(literal.shape()) >= kMaxInternalConstantSizeInBytes) { string global_name = tensorflow::strings::StrCat( "constant_global_", external_global_constant_counter_++); - global_for_const = new llvm::GlobalVariable( + llvm::GlobalVariable* result_global = new llvm::GlobalVariable( /*Module=*/*module_, /*Type=*/IrShapeType(literal.shape()), /*isConstant=*/true, /*Linkage=*/llvm::GlobalValue::ExternalLinkage, /*Initializer=*/nullptr, /*Name=*/AsStringRef(global_name)); - global_for_const->setAlignment(MinimumAlignmentForShape(literal.shape())); + result_global->setAlignment(MinimumAlignmentForShape(literal.shape())); external_constant_pool_->Insert(global_name, literal, MinimumAlignmentForShape(literal.shape())); + result = result_global; } else { llvm::Constant* initializer = llvm_ir::ConvertLiteralToIrConstant(literal, module_); - global_for_const = new llvm::GlobalVariable( + llvm::GlobalVariable* result_global = new llvm::GlobalVariable( /*Module=*/*module_, /*Type=*/initializer->getType(), /*isConstant=*/true, /*Linkage=*/llvm::GlobalValue::PrivateLinkage, /*Initializer=*/initializer, /*Name=*/""); - global_for_const->setAlignment(MinimumAlignmentForShape(literal.shape())); + result_global->setAlignment(MinimumAlignmentForShape(literal.shape())); + result = llvm::ConstantExpr::getBitCast( + result_global, IrShapeType(literal.shape())->getPointerTo()); + } + return result; +} + +Status IrEmitter::HandleConstant(HloInstruction* constant) { + VLOG(2) << "HandleConstant: " << constant->ToString(); + const Literal& literal = constant->literal(); + llvm::Constant* global_for_const; + + auto it = emitted_literals_.find(&literal); + if (it != emitted_literals_.end()) { + global_for_const = it->second; + } else { + global_for_const = EmitGlobalForLiteral(literal); + emitted_literals_[&literal] = global_for_const; } emitted_value_[constant] = global_for_const; VLOG(2) << " emitted value: " << llvm_ir::DumpToString(*global_for_const); @@ -214,32 +232,6 @@ Status IrEmitter::HandleCopy(HloInstruction* copy) { } } -// Calculate the alignment of a buffer with a particular size. -int IrEmitter::MinimumAlignmentForBufferSize(int64 buffer_size) { - // GLibc returns a pointer with alignment 8 on 32-bit platforms and 16 on - // 64-bit platforms. TCMalloc returns a pointer with alignment 8 for - // allocations smaller than kMallocAlignmentThreshold bytes and at least - // alignment 16 for allocations greater than or equal to - // kMallocAlignmentThreshold bytes. N.B. We could improve on this lower bound - // by explicitly allocating the memory with posix_memalign. This is - // complicated by our desire to allow parameter buffers created by clients to - // be consumed directly by the JIT. - if (buffer_size == 0) { - // No need to align empty buffers. - return 1; - } - - const int64 kMallocAlignmentThreshold = 512; - - int pointer_size = module_->getDataLayout().getPointerSize(); - int buffer_alignment = buffer_size >= kMallocAlignmentThreshold - ? 2 * pointer_size - : pointer_size; - DCHECK_GT(buffer_alignment, 0); - - return buffer_alignment; -} - // Calculate the alignment of a buffer allocated for a given primitive type. 
int IrEmitter::MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type) { int64 byte_size = ShapeUtil::ByteSizeOfPrimitiveType(primitive_type); @@ -264,7 +256,7 @@ int IrEmitter::MinimumAlignmentForShape(const Shape& shape) { DCHECK_GE(buffer_size, 0); DCHECK_LE(buffer_size, SIZE_MAX); - return MinimumAlignmentForBufferSize(buffer_size); + return target_machine_features_.minimum_alignment_for_allocation(buffer_size); } void IrEmitter::AttachAlignmentMetadataForLoad(llvm::LoadInst* load, @@ -277,7 +269,8 @@ void IrEmitter::AttachAlignmentMetadataForLoad(llvm::LoadInst* load, void IrEmitter::AttachAlignmentMetadataForLoad(llvm::LoadInst* load, int64 buffer_size) { - int alignment = MinimumAlignmentForBufferSize(buffer_size); + int alignment = + target_machine_features_.minimum_alignment_for_allocation(buffer_size); if (alignment > 1) { llvm_ir::SetAlignmentMetadataForLoad(load, alignment); } @@ -517,7 +510,7 @@ Status IrEmitter::HandleReduceWindow(HloInstruction* reduce_window) { HloComputation* function = reduce_window->to_apply(); TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( /*instruction=*/*reduce_window, /*operands=*/{operand}, - /*supported_types=*/{F32, BF16})); + /*supported_types=*/{F32, BF16, S32})); // TODO(b/31410564): Implement dilation for reduce-window. if (window_util::HasDilation(window)) { @@ -814,13 +807,6 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { "Dot with multiple contracting dimensions not implemented."); } - if (dnums.lhs_contracting_dimensions(0) != - std::min(lhs->shape().dimensions_size() - 1, 1) || - dnums.rhs_contracting_dimensions(0) != 0) { - return Unimplemented( - "Dot with non-standard contracting dimensions not implemented."); - } - llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs)); llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs)); @@ -837,8 +823,7 @@ Status IrEmitter::HandleDot(HloInstruction* dot) { // Dot operation is complicated so we delegate to a helper class. return DotOpEmitter::EmitDotOperation( - *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array, - lhs_array, rhs_array, /*addend_array=*/nullptr, + *dot, target_array, lhs_array, rhs_array, /*addend_array=*/nullptr, GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_, target_machine_features_); } @@ -854,7 +839,10 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { const ConvolutionDimensionNumbers& dnums = convolution->convolution_dimension_numbers(); - if (PotentiallyImplementedAsEigenConvolution(*convolution)) { + // TODO(tonywy): Add PotentiallyImplementedAsMKLCovolution to support + // different data layouts. + if (PotentiallyImplementedAsEigenConvolution(*convolution, + target_machine_features_)) { const Shape& lhs_shape = lhs->shape(); const Shape& rhs_shape = rhs->shape(); const Shape& convolution_shape = convolution->shape(); @@ -942,16 +930,26 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - bool multi_threaded_eigen = + bool multi_threaded = hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + bool use_mkl_dnn = + hlo_module_config_.debug_options().xla_cpu_use_mkl_dnn(); + + // TODO(b/78639006) Singlethread MKL conv2d is not implemented due to the + // potential race condition by setting the omp_num_threads. const char* fn_name = primitive_type == F16 - ? (multi_threaded_eigen + ? (multi_threaded ? 
runtime::kEigenConvF16SymbolName : runtime::kEigenSingleThreadedConvF16SymbolName) - : (multi_threaded_eigen - ? runtime::kEigenConvF32SymbolName + : (multi_threaded + ? (use_mkl_dnn ? runtime::kMKLConvF32SymbolName + : runtime::kEigenConvF32SymbolName) : runtime::kEigenSingleThreadedConvF32SymbolName); + if (!multi_threaded && use_mkl_dnn) { + LOG(WARNING) << "Using Eigen instead of MKL-DNN for single-threaded " + "conv2d function."; + } llvm::Function* conv_func = llvm::cast( module_->getOrInsertFunction(fn_name, conv_type)); conv_func->setCallingConv(llvm::CallingConv::C); @@ -1010,12 +1008,14 @@ Status IrEmitter::HandleConvolution(HloInstruction* convolution) { // We will accumulate the products into this sum to calculate // the output entry at the given index. PrimitiveType lhs_element_type = lhs->shape().element_type(); + llvm::Type* lhs_llvm_type = + llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_); llvm::Value* sum_address = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(lhs_element_type, module_), - "convolution_sum_address", &ir_builder_, + lhs_llvm_type, "convolution_sum_address", &ir_builder_, MinimumAlignmentForPrimitiveType(lhs_element_type)); - ir_builder_.CreateStore( - llvm::ConstantFP::get(ir_builder_.getFloatTy(), 0.0), sum_address); + llvm::Value* constant_zero = + llvm::Constant::getNullValue(lhs_llvm_type); + ir_builder_.CreateStore(constant_zero, sum_address); llvm_ir::ForLoopNest loops(IrName(convolution, "inner"), &ir_builder_); std::vector kernel_spatial(num_spatial_dims); @@ -1169,7 +1169,13 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { {int8_ptr_type, int8_ptr_type, int8_ptr_type, int32_type, int32_type, int64_type, int64_type, int64_type, int64_type}, /*isVarArg=*/false); - const char* fn_name = runtime::kEigenFftSymbolName; + + bool multi_threaded_eigen = + hlo_module_config_.debug_options().xla_cpu_multi_thread_eigen(); + const char* fn_name = multi_threaded_eigen + ? runtime::kEigenFftSymbolName + : runtime::kEigenSingleThreadedFftSymbolName; + llvm::Function* fft_func = llvm::cast( module_->getOrInsertFunction(fn_name, fft_type)); fft_func->setCallingConv(llvm::CallingConv::C); @@ -1191,16 +1197,45 @@ Status IrEmitter::HandleFft(HloInstruction* fft) { } Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) { - if (hlo_module_config_.replica_count() == 1) { - // When there is a single replica, a cross replica sum is the identity - // function, and the buffer assignment expects a copy (we could eliminate - // these at the HLO level as an optimization). - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(crs)); + if (hlo_module_config_.replica_count() != 1) { + // TODO(b/33011107): Support nontrivial cross replica sum on CPU. + return Unimplemented( + "CrossReplicaSum with >1 replica is not implemented on CPU."); + } + + // When there is a single replica, a cross replica sum is the identity + // function, and the buffer assignment expects a copy. + // + // TODO(b/80100934): We would like to eliminate one-replica CRS nodes entirely + // in algebraic-simplifier, but currently on some platforms + // HloModuleConfig::num_replicas changes between when the module is compiled + // and when it's run. + TF_RETURN_IF_ERROR(EmitTargetAddressForOp(crs)); + + // CRS with one operand and one replica is simply the identity function. + if (crs->operand_count() == 1) { return EmitMemcpy(*crs->operand(0), *crs); } - // TODO(b/33011107): Support cross replica sum on CPU. 
- return Unimplemented("CrossReplicaSum is not implemented on CPU."); + // CRS with multiple operands and one replica produces a (one-deep) tuple. + std::vector operand_ptrs; + for (int64 i = 0; i < crs->operand_count(); ++i) { + llvm::Value* in_ptr = GetEmittedValueFor(crs->operand(i)); + TF_ASSIGN_OR_RETURN(const BufferAllocation::Slice out_slice, + assignment_.GetUniqueSlice(crs, {i})); + + const Shape& operand_shape = crs->operand(i)->shape(); + CHECK(ShapeUtil::IsArray(operand_shape)) + << "Operands to cross-replica-sum must be arrays: " << crs->ToString(); + operand_ptrs.push_back(EmitTempBufferPointer(out_slice, operand_shape)); + + // TODO(b/63762267): Be more aggressive about specifying alignment. + ir_builder_.CreateMemCpy(operand_ptrs.back(), /*DstAlign=*/1, in_ptr, + /*SrcAlign=*/1, + ShapeUtil::ByteSizeOf(operand_shape)); + } + llvm_ir::EmitTuple(GetIrArrayFor(crs), operand_ptrs, &ir_builder_, module_); + return Status::OK(); } // Fills up the free variables in 'index_with_free_var' with values from @@ -2061,44 +2096,7 @@ static const HloInstruction* StripTranspose(const HloInstruction& hlo) { Status IrEmitter::HandleFusion(HloInstruction* fusion) { auto* root = fusion->fused_expression_root(); - if (fusion->fusion_kind() == HloInstruction::FusionKind::kTransposeDot) { - DCHECK(root->opcode() == HloOpcode::kDot); - const HloInstruction* lhs_parameter = StripTranspose(*root->operand(0)); - const HloInstruction* rhs_parameter = StripTranspose(*root->operand(1)); - DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter && - rhs_parameter->opcode() == HloOpcode::kParameter); - const HloInstruction* lhs = - fusion->operand(lhs_parameter->parameter_number()); - const HloInstruction* rhs = - fusion->operand(rhs_parameter->parameter_number()); - - TF_RETURN_IF_ERROR(ElementTypesSameAndSupported( - /*instruction=*/*root, /*operands=*/{lhs, rhs}, - /*supported_types=*/{F16, F32, F64})); - - llvm_ir::IrArray lhs_array(GetIrArrayFor(lhs)); - llvm_ir::IrArray rhs_array(GetIrArrayFor(rhs)); - - Shape target_shape = fusion->shape(); - TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fusion)); - llvm_ir::IrArray target_array = GetIrArrayFor(fusion); - VLOG(2) << "HandleFusion kTransposeDot: "; - VLOG(2) << " lhs operand: " - << llvm_ir::DumpToString(*lhs_array.GetBasePointer()); - VLOG(2) << " rhs operand: " - << llvm_ir::DumpToString(*rhs_array.GetBasePointer()); - VLOG(2) << " target: " - << llvm_ir::DumpToString(*target_array.GetBasePointer()); - - // Dot operation is complicated so we delegate to a helper class. 
- TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation( - *root, root->operand(0)->IsRank2Transpose(), - root->operand(1)->IsRank2Transpose(), target_array, lhs_array, - rhs_array, /*addend_array=*/nullptr, GetExecutableRunOptionsArgument(), - &ir_builder_, hlo_module_config_, target_machine_features_)); - return Status::OK(); - } else if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, - assignment_)) { + if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment_)) { VLOG(3) << "HandleFusion FusedDynamicUpdateSliceInPlace"; CpuElementalIrEmitter elemental_emitter(hlo_module_config_, this, module_); TF_RETURN_IF_ERROR(EmitTargetAddressForOp(fusion)); @@ -2141,9 +2139,9 @@ Status IrEmitter::HandleFusion(HloInstruction* fusion) { GetIrArrayFor(fusion->operand(addend_param_number))); TF_RETURN_IF_ERROR(DotOpEmitter::EmitDotOperation( - *dot, /*transpose_lhs=*/false, /*transpose_rhs=*/false, target_array, - lhs_array, rhs_array, &addend_array, GetExecutableRunOptionsArgument(), - &ir_builder_, hlo_module_config_, target_machine_features_)); + *dot, target_array, lhs_array, rhs_array, &addend_array, + GetExecutableRunOptionsArgument(), &ir_builder_, hlo_module_config_, + target_machine_features_)); return Status::OK(); } else { return Unimplemented("Fusion kind not implemented on CPU"); @@ -2538,8 +2536,12 @@ Status IrEmitter::FinishVisit(HloInstruction* root) { // nothing to do since the result was already written directly into the output // buffer. VLOG(2) << "FinishVisit root: " << root->ToString(); - llvm::Value* root_value = GetEmittedValueFor(root); - VLOG(2) << " value: " << llvm_ir::DumpToString(*root_value); + if (root->opcode() == HloOpcode::kOutfeed) { + VLOG(2) << " outfeed with value: " + << llvm_ir::DumpToString(*GetEmittedValueFor(root->operand(0))); + } else { + VLOG(2) << " value: " << llvm_ir::DumpToString(*GetEmittedValueFor(root)); + } auto record_complete_computation = [&](llvm::Value* prof_counter) { if (prof_counter) { diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h index 0f2f3d1817d6e8..32c536e18fee86 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h @@ -76,7 +76,7 @@ class IrEmitter : public DfsHloVisitorWithDefault { instruction_to_profile_idx, std::unordered_map computation_to_profile_idx, - llvm::TargetMachine* target_machine, + const TargetMachineFeatures* target_machine, ExternalConstantPool* external_constant_pool); ~IrEmitter() override; @@ -514,9 +514,6 @@ class IrEmitter : public DfsHloVisitorWithDefault { // Calculate the alignment of a buffer allocated for a given primitive type. int MinimumAlignmentForPrimitiveType(PrimitiveType primitive_type); - // Calculate the alignment of a buffer with a particular size. - int MinimumAlignmentForBufferSize(int64 buffer_size); - // Returns the number of bytes within the shape. int64 ByteSizeOf(const Shape& shape) const; @@ -530,15 +527,32 @@ class IrEmitter : public DfsHloVisitorWithDefault { Status EmitXfeedTransfer(XfeedKind kind, const Shape& shape, llvm::Value* program_buffer_address); + // Returns a ConstExpr bitcast. 
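+  // Globals emitted here are cached in emitted_literals_ (declared below), so
+  // a literal that appears several times in a module is emitted only once; see
+  // HandleConstant in ir_emitter.cc for the cache lookup.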
+ llvm::Constant* EmitGlobalForLiteral(const Literal& literal); + const HloModuleConfig& hlo_module_config_; bool is_top_level_computation_; - TargetMachineFeatures target_machine_features_; + const TargetMachineFeatures& target_machine_features_; int64 external_global_constant_counter_ = 0; ExternalConstantPool* external_constant_pool_; + struct LiteralPtrHashFunctor { + size_t operator()(const Literal* literal) const { return literal->Hash(); } + }; + + struct LiteralPtrEqualityFunctor { + bool operator()(const Literal* lhs, const Literal* rhs) const { + return *lhs == *rhs; + } + }; + + tensorflow::gtl::FlatMap + emitted_literals_; + TF_DISALLOW_COPY_AND_ASSIGN(IrEmitter); }; diff --git a/tensorflow/compiler/xla/service/cpu/ir_function.h b/tensorflow/compiler/xla/service/cpu/ir_function.h index 557aa4a6bfc2ef..2e55181eed867a 100644 --- a/tensorflow/compiler/xla/service/cpu/ir_function.h +++ b/tensorflow/compiler/xla/service/cpu/ir_function.h @@ -33,8 +33,8 @@ namespace cpu { // emitters for function and function argument access. // The llvm::Function is created with the standard function signature // used in the XLA CPU backend (see ir_function.cc for argument details). -// In addtion IrFunction saves the callers IR insert point during contruction, -// and restores it after desctruction. +// In addition IrFunction saves the callers IR insert point during construction, +// and restores it after destruction. // // Example usage: // diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc index fb28280fade307..4fa5984b0466b1 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc @@ -38,7 +38,7 @@ class SimpleCostModel : public ParallelCostModel { const int64 min_cost_per_thread = 256LL << 10; // 256KB L2 Cache size. // Return target parallel task count in [1, max_parallelism_]. return std::min(max_parallelism_, - std::max(1LL, instruction_cost / min_cost_per_thread)); + std::max(int64{1}, instruction_cost / min_cost_per_thread)); } private: @@ -63,7 +63,7 @@ class DefaultCostModel : public ParallelCostModel { int64 max_parallelism; // Calculate flops-to-bytes-ratio for 'instruction'. const int64 bytes_accessed = - std::max(1LL, cost_analysis_->bytes_accessed(*instruction)); + std::max(int64{1}, cost_analysis_->bytes_accessed(*instruction)); const float flops_to_bytes_ratio = cost_analysis_->flop_count(*instruction) / static_cast(bytes_accessed); @@ -93,7 +93,7 @@ class DefaultCostModel : public ParallelCostModel { } // Return target parallel task count in [1, max_parallelism_]. return std::min(max_parallelism, - std::max(1LL, instruction_cost / min_cost_per_thread)); + std::max(int64{1}, instruction_cost / min_cost_per_thread)); } private: @@ -104,7 +104,9 @@ class DefaultCostModel : public ParallelCostModel { ParallelTaskAssignment::ParallelTaskAssignment( const int64 max_parallelism, - const HloCostAnalysis::ShapeSizeFunction& shape_size, HloModule* module) { + const HloCostAnalysis::ShapeSizeFunction& shape_size, HloModule* module, + const TargetMachineFeatures* target_machine_features) + : target_machine_features_(*target_machine_features) { VLOG(1) << "ParallelTaskAssignment max_parallelism: " << max_parallelism; // Run cost analysis on 'module'. 
auto cost_analysis = MakeUnique(shape_size); @@ -127,7 +129,7 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( // Currently, we do not assign parallel tasks to instructions with at least // one of the following properties: // *) Internal threading (library calls to kConv, kDot, kFft, kCustomCall). - // *) Emit custom loops (kSelectAndScatter, FusionKind::kTransposeDot). + // *) Emit custom loops (kSelectAndScatter). // *) Operations that are not thread safe (like infeed and rng). // *) Tuple-shaped. // TODO(b/27458679) Parallelize instructions which are skipped here. @@ -139,8 +141,10 @@ int64 ParallelTaskAssignment::GetTargetParallelTaskCount( opcode == HloOpcode::kFft || opcode == HloOpcode::kInfeed || opcode == HloOpcode::kOutfeed || opcode == HloOpcode::kRng || (opcode == HloOpcode::kConvolution && - PotentiallyImplementedAsEigenConvolution(*instruction)) || - PotentiallyImplementedAsEigenDot(*instruction) || + PotentiallyImplementedAsEigenConvolution(*instruction, + target_machine_features_)) || + PotentiallyImplementedAsEigenDot(*instruction, + target_machine_features_) || (opcode == HloOpcode::kFusion && instruction->fusion_kind() != HloInstruction::FusionKind::kLoop) || ShapeUtil::IsTuple(instruction->shape())) { @@ -231,7 +235,8 @@ bool ParallelTaskAssigner::AssignParallelTasksHelper( void ParallelTaskAssigner::ComputeTargetParallelTasks( HloModule* module, HloToParallelTasks* hlo_to_parallel_tasks) { ParallelTaskAssignment parallel_task_assignment(max_parallelism_, - shape_size_function_, module); + shape_size_function_, module, + &target_machine_features_); // Compute parallel task counts for all instructions in 'module'. for (auto* computation : module->computations()) { diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h index 7140dabe516cd7..8becc8fa23424d 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_PARALLEL_TASK_ASSIGNMENT_H_ +#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_pass_interface.h" @@ -39,7 +40,8 @@ class ParallelTaskAssignment { // 'module': the containing HloModule. ParallelTaskAssignment(const int64 max_parallelism, const HloCostAnalysis::ShapeSizeFunction& shape_size, - HloModule* module); + HloModule* module, + const TargetMachineFeatures* target_machine_features); ~ParallelTaskAssignment() {} // Computes and returns the target parallel task count for 'instruction'. @@ -47,6 +49,7 @@ class ParallelTaskAssignment { private: std::unique_ptr cost_model_; + const TargetMachineFeatures& target_machine_features_; }; // ParallelTaskAssigner computes target parallel task counts for all HLOs @@ -63,8 +66,11 @@ class ParallelTaskAssigner : public HloPassInterface { // 'shape_size': shape size function used by HloCostAnalysis during parallel // task assignment. 
ParallelTaskAssigner(const int64 max_parallelism, - const HloCostAnalysis::ShapeSizeFunction& shape_size) - : max_parallelism_(max_parallelism), shape_size_function_(shape_size) {} + const HloCostAnalysis::ShapeSizeFunction& shape_size, + const TargetMachineFeatures* target_machine_features) + : max_parallelism_(max_parallelism), + shape_size_function_(shape_size), + target_machine_features_(*target_machine_features) {} ~ParallelTaskAssigner() override {} tensorflow::StringPiece name() const override { @@ -94,6 +100,7 @@ class ParallelTaskAssigner : public HloPassInterface { int64 max_parallelism_; HloCostAnalysis::ShapeSizeFunction shape_size_function_; + const TargetMachineFeatures& target_machine_features_; }; } // namespace cpu diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc index 13eb75a57213b1..fc2efbaf9a22b0 100644 --- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc +++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/cpu/parallel_task_assignment.h" #include "tensorflow/compiler/xla/service/cpu/cpu_executable.h" +#include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -31,6 +32,19 @@ class ParallelTaskAssignmentTest : public HloVerifiedTestBase { // Use any value larger than 2 since we only test whether a module is // parallelized or not const int max_parallelism_ = 10; + + cpu::TargetMachineFeaturesWithFakeAlignmentLogic target_machine_features_; + + ParallelTaskAssignmentTest() + : target_machine_features_([](int64 shape_size) { + return cpu::TargetMachineFeatures::kEigenExpectedTensorAlignment; + }) {} + + StatusOr RunParallelTaskAssigner(HloModule* module) { + return cpu::ParallelTaskAssigner(max_parallelism_, shape_size_func_, + &target_machine_features_) + .Run(module); + } }; TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) { @@ -45,9 +59,7 @@ TEST_F(ParallelTaskAssignmentTest, DotOperationNotParallelized) { )"; ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner( - max_parallelism_, shape_size_func_) - .Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module())); EXPECT_FALSE(changed); } @@ -74,9 +86,7 @@ TEST_F(ParallelTaskAssignmentTest, )"; ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner( - max_parallelism_, shape_size_func_) - .Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module())); EXPECT_FALSE(changed); } @@ -92,9 +102,7 @@ TEST_F(ParallelTaskAssignmentTest, RngOperationNotParallelized) { )"; ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner( - max_parallelism_, shape_size_func_) - .Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool changed, RunParallelTaskAssigner(&module())); EXPECT_FALSE(changed); } @@ -108,9 +116,7 @@ TEST_F(ParallelTaskAssignmentTest, InfeedOutfeedOperationNotParallelized) { )"; ParseAndVerifyModule(hlo_string); - TF_ASSERT_OK_AND_ASSIGN(bool changed, cpu::ParallelTaskAssigner( - max_parallelism_, shape_size_func_) - .Run(&module())); + TF_ASSERT_OK_AND_ASSIGN(bool changed, 
RunParallelTaskAssigner(&module())); EXPECT_FALSE(changed); } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc new file mode 100644 index 00000000000000..c60580d6e763c6 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.cc @@ -0,0 +1,183 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h" +#include +#include "tensorflow/compiler/xla/executable_run_options.h" +#include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::int64; + +#ifdef INTEL_MKL +#include +#include "mkldnn.hpp" +#include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h" + +namespace { + +// Downcast an int64 to int and check if value is in range. +int ToInt(int64 input) { + int output = static_cast(input); + if (static_cast(output) != input) { + std::cerr << "Error occurred in downcasting int64 to int32: Value " << input + << " is out-of-range for type int32. \n"; + exit(1); + } + return output; +} + +using mkldnn::convolution_direct; +using mkldnn::convolution_forward; +using mkldnn::engine; +using mkldnn::memory; +using mkldnn::padding_kind; +using mkldnn::primitive; +using mkldnn::prop_kind; +using mkldnn::reorder; +using mkldnn::stream; + +template +void MKLConvImpl(const EigenDevice& device, ScalarType* out, ScalarType* lhs, + ScalarType* rhs, int64 input_batch, int64 input_rows, + int64 input_cols, int64 input_channels, int64 kernel_rows, + int64 kernel_cols, int64 kernel_channels, int64 kernel_filters, + int64 output_rows, int64 output_cols, int64 row_stride, + int64 col_stride, int64 padding_top, int64 padding_bottom, + int64 padding_left, int64 padding_right, + int64 lhs_row_dilation, int64 lhs_col_dilation, + int64 rhs_row_dilation, int64 rhs_col_dilation) { + auto cpu_engine = engine(engine::cpu, 0); + + // Create a vector primitive to hold the network. + std::vector net; + + // Since memory::dims takes int for each dimension, we downcast the int64 + // values to int using the ToInt function defined above. + memory::dims conv1_src_dim = {ToInt(input_batch), ToInt(input_channels), + ToInt(input_rows), ToInt(input_cols)}; + memory::dims conv1_weights_dim = {ToInt(kernel_filters), + ToInt(kernel_channels), ToInt(kernel_rows), + ToInt(kernel_cols)}; + memory::dims conv1_dst_dim = {ToInt(input_batch), ToInt(kernel_filters), + ToInt(output_rows), ToInt(output_cols)}; + memory::dims conv1_strides = {ToInt(row_stride), ToInt(col_stride)}; + // Note: In MKL_DNN dilation starts from 0. + memory::dims conv1_dilates = {ToInt(rhs_row_dilation - 1), + ToInt(rhs_col_dilation - 1)}; + memory::dims conv1_padding_l = {ToInt(padding_top), ToInt(padding_left)}; + memory::dims conv1_padding_r = {ToInt(padding_bottom), ToInt(padding_right)}; + + // Create memory for user data. 
Input and output data have format of NHWC and + // kernel data has format of HWIO. + // Note that as a convention in MKL-DNN, the dimensions of the data is always + // described in NCHW/IOHW, regardless of the actual layout of the data. + auto user_src_memory = + memory({{{conv1_src_dim}, memory::data_type::f32, memory::format::nhwc}, + cpu_engine}, + lhs); + auto user_weights_memory = memory( + {{{conv1_weights_dim}, memory::data_type::f32, memory::format::hwio}, + cpu_engine}, + rhs); + auto user_dst_memory = + memory({{{conv1_dst_dim}, memory::data_type::f32, memory::format::nhwc}, + cpu_engine}, + out); + + // Create memory descriptors for convolution data with no specified format for + // best performance. + auto conv1_src_mem_desc = memory::desc( + {conv1_src_dim}, memory::data_type::f32, memory::format::any); + auto conv1_weights_mem_desc = memory::desc( + {conv1_weights_dim}, memory::data_type::f32, memory::format::any); + auto conv1_dst_mem_desc = memory::desc( + {conv1_dst_dim}, memory::data_type::f32, memory::format::any); + + // Create a convolution. + auto conv1_desc = convolution_forward::desc( + prop_kind::forward_inference, convolution_direct, conv1_src_mem_desc, + conv1_weights_mem_desc, conv1_dst_mem_desc, conv1_strides, conv1_dilates, + conv1_padding_l, conv1_padding_r, padding_kind::zero); + auto conv1_prim_desc = + convolution_forward::primitive_desc(conv1_desc, cpu_engine); + + // Create reorders for data and weights if layout requested by convolution is + // different from NCHW/OIHW. + auto conv1_src_memory = user_src_memory; + if (memory::primitive_desc(conv1_prim_desc.src_primitive_desc()) != + user_src_memory.get_primitive_desc()) { + conv1_src_memory = memory(conv1_prim_desc.src_primitive_desc()); + net.push_back(reorder(user_src_memory, conv1_src_memory)); + } + + auto conv1_weights_memory = user_weights_memory; + if (memory::primitive_desc(conv1_prim_desc.weights_primitive_desc()) != + user_weights_memory.get_primitive_desc()) { + conv1_weights_memory = memory(conv1_prim_desc.weights_primitive_desc()); + net.push_back(reorder(user_weights_memory, conv1_weights_memory)); + } + + // Check if output need layout conversion. If yes, create memory for + // intermediate layer of conv1_dst_memory. + bool need_output_conversion = + (memory::primitive_desc(conv1_prim_desc.dst_primitive_desc()) != + user_dst_memory.get_primitive_desc()); + auto conv1_dst_memory = need_output_conversion + ? memory(conv1_prim_desc.dst_primitive_desc()) + : user_dst_memory; + + // Create convolution primitive and add it to net. + net.push_back(convolution_forward(conv1_prim_desc, conv1_src_memory, + conv1_weights_memory, conv1_dst_memory)); + if (need_output_conversion) { + net.push_back(reorder(conv1_dst_memory, user_dst_memory)); + } + stream(stream::kind::eager).submit(net).wait(); +} +} // namespace +#endif // INTEL_MKL + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_MKLConvF32( + const void* run_options_ptr, float* out, float* lhs, float* rhs, + int64 input_batch, int64 input_rows, int64 input_cols, int64 input_channels, + int64 kernel_rows, int64 kernel_cols, int64 kernel_channels, + int64 kernel_filters, int64 output_rows, int64 output_cols, + int64 row_stride, int64 col_stride, int64 padding_top, int64 padding_bottom, + int64 padding_left, int64 padding_right, int64 lhs_row_dilation, + int64 lhs_col_dilation, int64 rhs_row_dilation, int64 rhs_col_dilation) { +#ifdef INTEL_MKL + // Since MKL_DNN cannot handle transposed convolution, this is handled by + // Eigen. 
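+  // An lhs (input) dilation greater than one is what a transposed convolution
+  // lowers to, and MKLConvImpl above only forwards the rhs (kernel) dilation
+  // to the MKL-DNN primitive (via conv1_dilates), so those calls fall through
+  // to the multi-threaded Eigen kernel with the same argument list.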
+ if (lhs_row_dilation > 1 || lhs_col_dilation > 1) { + __xla_cpu_runtime_EigenConvF32( + run_options_ptr, out, lhs, rhs, input_batch, input_rows, input_cols, + input_channels, kernel_rows, kernel_cols, kernel_channels, + kernel_filters, output_rows, output_cols, row_stride, col_stride, + padding_top, padding_bottom, padding_left, padding_right, + lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation); + } else { + MKLConvImpl(nullptr, out, lhs, rhs, input_batch, input_rows, input_cols, + input_channels, kernel_rows, kernel_cols, kernel_channels, + kernel_filters, output_rows, output_cols, row_stride, + col_stride, padding_top, padding_bottom, padding_left, + padding_right, lhs_row_dilation, lhs_col_dilation, + rhs_row_dilation, rhs_col_dilation); + } +#else + std::cerr << "Attempt to call MKL Conv2D runtime library without defining " + "INTEL_MKL. Add --config=mkl to build with MKL."; + exit(1); +#endif // INTEL_MKL +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h new file mode 100644 index 00000000000000..b239e71d231c52 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h @@ -0,0 +1,39 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_MKL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_MKL_H_ + +#include +#include "tensorflow/core/platform/types.h" + +extern "C" { + +extern void __xla_cpu_runtime_MKLConvF32( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, float* out, + float* lhs, float* rhs, tensorflow::int64 input_batch, + tensorflow::int64 input_rows, tensorflow::int64 input_cols, + tensorflow::int64 input_channels, tensorflow::int64 kernel_rows, + tensorflow::int64 kernel_cols, tensorflow::int64 kernel_channels, + tensorflow::int64 kernel_filters, tensorflow::int64 output_rows, + tensorflow::int64 output_cols, tensorflow::int64 row_stride, + tensorflow::int64 col_stride, tensorflow::int64 padding_top, + tensorflow::int64 padding_bottom, tensorflow::int64 padding_left, + tensorflow::int64 padding_right, tensorflow::int64 lhs_row_dilation, + tensorflow::int64 lhs_col_dilation, tensorflow::int64 rhs_row_dilation, + tensorflow::int64 rhs_col_dilation); +} + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_CONV2D_MKL_H_ diff --git a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h index 984cb0616e0247..0bf693edd0b985 100644 --- a/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h +++ b/tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h @@ -21,8 +21,6 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/numeric_types.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/types.h" // 'tensorflow' namespace is used so that int64 and other types don't require @@ -71,11 +69,9 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = fft_shape[i]; out_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -88,8 +84,8 @@ void EigenFftR2C(const EigenDevice& device, complex64* out, float* operand, const auto axes = Eigen::ArrayXi::LinSpaced(FFTRank, 1, FFTRank); // Compute the full FFT using a temporary tensor. - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(in_dims); + const Eigen::DSizes zero_start_indices; full_fft.device(device) = input.template fft(axes); @@ -112,11 +108,9 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, in_dims[0] = input_batch; Eigen::DSizes out_dims; out_dims[0] = input_batch; - TensorShape temp_shape{input_batch}; for (int i = 0; i < FFTRank; i++) { in_dims[i + 1] = i == FFTRank - 1 ? fft_shape[i] / 2 + 1 : fft_shape[i]; out_dims[i + 1] = fft_shape[i]; - temp_shape.AddDim(fft_shape[i]); } const Eigen::TensorMap, Eigen::Aligned> @@ -129,8 +123,7 @@ void EigenFftC2R(const EigenDevice& device, float* out, complex64* operand, // region we will slice from input given fft_shape. We slice input to // fft_shape on its inner-most dimensions, except the last (which we // slice to fft_shape[-1] / 2 + 1). - Tensor temp(DataTypeToEnum::v(), temp_shape); - auto full_fft = temp.flat_inner_dims(); + Eigen::Tensor full_fft(out_dims); // Calculate the starting point and range of the source of // negative frequency part. @@ -179,7 +172,6 @@ template void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, int32 fft_type, int64 input_batch, int64 fft_length0, int64 fft_length1, int64 fft_length2) { - CHECK(::xla::FftType_IsValid(fft_type)) << fft_type; switch (fft_type) { case ::xla::FftType::FFT: EigenFftC2C( @@ -204,7 +196,8 @@ void EigenFftWithRank(const EigenDevice& device, void* out, void* operand, input_batch, fft_length0, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT type: " << fft_type; + // Unsupported FFT type + abort(); } } @@ -230,7 +223,8 @@ void EigenFftImpl(const EigenDevice& device, void* out, void* operand, fft_length1, fft_length2); break; default: - LOG(FATAL) << "Unsupported FFT rank " << fft_rank; + // Unsupported FFT rank + abort(); } } diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc new file mode 100644 index 00000000000000..2613ddb12704ae --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.cc @@ -0,0 +1,32 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" + +#include "tensorflow/compiler/xla/service/cpu/runtime_fft_impl.h" +#include "tensorflow/core/platform/dynamic_annotations.h" +#include "tensorflow/core/platform/types.h" + +using tensorflow::int32; +using tensorflow::int64; + +TF_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* run_options_ptr, void* out, void* operand, int32 fft_type, + int32 fft_rank, int64 input_batch, int64 fft_length0, int64 fft_length1, + int64 fft_length2) { + tensorflow::xla::EigenFftImpl(Eigen::DefaultDevice(), out, operand, fft_type, + fft_rank, input_batch, fft_length0, fft_length1, + fft_length2); +} diff --git a/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h new file mode 100644 index 00000000000000..dcd133d012cf07 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h @@ -0,0 +1,31 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ + +#include "tensorflow/core/platform/types.h" + +extern "C" { + +extern void __xla_cpu_runtime_EigenSingleThreadedFft( + const void* /* xla::ExecutableRunOptions* */ run_options_ptr, void* out, + void* operand, tensorflow::int32 fft_type, tensorflow::int32 fft_rank, + tensorflow::int64 input_batch, tensorflow::int64 fft_length0, + tensorflow::int64 fft_length1, tensorflow::int64 fft_length2); + +} // extern "C" + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_RUNTIME_SINGLE_THREADED_FFT_H_ diff --git a/tensorflow/compiler/xla/service/cpu/sample_harness.cc b/tensorflow/compiler/xla/service/cpu/sample_harness.cc index b3f4609d465efb..167aa4adda995a 100644 --- a/tensorflow/compiler/xla/service/cpu/sample_harness.cc +++ b/tensorflow/compiler/xla/service/cpu/sample_harness.cc @@ -19,10 +19,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -48,13 +48,13 @@ int main(int argc, char** argv) { client->TransferToServer(*param1_literal).ConsumeValueOrDie(); // Build computation. - xla::ComputationBuilder builder(client, ""); + xla::XlaBuilder builder(""); auto p0 = builder.Parameter(0, param0_literal->shape(), "param0"); auto p1 = builder.Parameter(1, param1_literal->shape(), "param1"); auto add = builder.Add(p1, p0, {0}); - xla::StatusOr computation_status = builder.Build(); - xla::Computation computation = computation_status.ConsumeValueOrDie(); + xla::StatusOr computation_status = builder.Build(); + xla::XlaComputation computation = computation_status.ConsumeValueOrDie(); // Execute and transfer result of computation. xla::ExecutionProfile profile; diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.cc b/tensorflow/compiler/xla/service/cpu/shape_partition.cc index 42fe955f1917e0..d12c5396148d32 100644 --- a/tensorflow/compiler/xla/service/cpu/shape_partition.cc +++ b/tensorflow/compiler/xla/service/cpu/shape_partition.cc @@ -115,7 +115,7 @@ ShapePartitionIterator::ShapePartitionIterator( for (int i = 0; i < dimension_partition_sizes_.size(); ++i) { const int64 dim_size = shape_.dimensions(dimensions_[i]); dimension_partition_sizes_[i] = - std::max(1LL, dim_size / dimension_partition_counts_[i]); + std::max(int64{1}, dim_size / dimension_partition_counts_[i]); } // Calculate the partition strides for each dimension. diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.h b/tensorflow/compiler/xla/service/cpu/shape_partition.h index 33d02b70e61e33..db2cda2936c834 100644 --- a/tensorflow/compiler/xla/service/cpu/shape_partition.h +++ b/tensorflow/compiler/xla/service/cpu/shape_partition.h @@ -38,7 +38,7 @@ namespace cpu { // // [0, 1), [1, 2), [2, 3), [3, 4), [4, 5) [5, 8) // -// Note that the last partition has residule because the dimension size is +// Note that the last partition has residual because the dimension size is // not a multiple of the partition count. // // diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc index b7ce5bbe474823..c4c90515ac7ec2 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc @@ -31,12 +31,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/cpu/custom_call_target_registry.h" #include "tensorflow/compiler/xla/service/cpu/orc_jit_memory_mapper.h" #include "tensorflow/compiler/xla/service/cpu/runtime_conv2d.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_conv2d_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_fork_join.h" #include "tensorflow/compiler/xla/service/cpu/runtime_fp16.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul.h" #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_conv2d.h" +#include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_fft.h" #include "tensorflow/compiler/xla/service/cpu/runtime_single_threaded_matmul.h" #include "tensorflow/compiler/xla/service/cpu/windows_compatibility.h" #include "tensorflow/compiler/xla/types.h" @@ -72,23 +74,33 @@ llvm::StringRef GetHostCpuName() { } } // namespace +/*static*/ std::unique_ptr +SimpleOrcJIT::InferTargetMachineForJIT( + const llvm::TargetOptions& target_options, + llvm::CodeGenOpt::Level opt_level) { + std::unique_ptr target_machine( + llvm::EngineBuilder() + .setTargetOptions(target_options) + .setOptLevel(opt_level) + .selectTarget( + /*TargetTriple=*/llvm::Triple(), /*MArch=*/"", + /*MCPU=*/GetHostCpuName(), + /*MAttrs=*/DetectMachineAttributes())); + CHECK(target_machine != nullptr); + return target_machine; +} + SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options, llvm::CodeGenOpt::Level opt_level, bool optimize_for_size, bool enable_fast_math, bool disable_expensive_passes, LLVMCompiler::ModuleHook pre_optimization_hook, LLVMCompiler::ModuleHook post_optimization_hook) - : target_machine_( - CHECK_NOTNULL(llvm::EngineBuilder() - .setTargetOptions(target_options) - .setOptLevel(opt_level) - .selectTarget( - /*TargetTriple=*/llvm::Triple(), /*MArch=*/"", - /*MCPU=*/GetHostCpuName(), - /*MAttrs=*/DetectMachineAttributes()))), + : target_machine_(InferTargetMachineForJIT(target_options, opt_level)), disassembler_(*target_machine_), data_layout_(target_machine_->createDataLayout()), symbol_resolver_(llvm::orc::createLegacyLookupResolver( + execution_session_, [this](const std::string& name) -> llvm::JITSymbol { return this->ResolveRuntimeSymbol(name); }, @@ -178,6 +190,7 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(AcquireInfeedBufferForDequeue); REGISTER_CPU_RUNTIME_SYMBOL(AcquireOutfeedBufferForPopulation); + REGISTER_CPU_RUNTIME_SYMBOL(MKLConvF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenConvF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenFft); @@ -190,6 +203,7 @@ bool RegisterKnownJITSymbols() { REGISTER_CPU_RUNTIME_SYMBOL(MKLSingleThreadedMatMulF64); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedConvF32); + REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedFft); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF16); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF32); REGISTER_CPU_RUNTIME_SYMBOL(EigenSingleThreadedMatMulF64); diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h index f4260a95bc4555..1851a3ee0bb97b 100644 --- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h +++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.h @@ -95,6 +95,12 @@ class SimpleOrcJIT { return &external_constant_pool_; } + // Creates an 
llvm::TargetMachine suitable for JITting code that will run on + // the current machine. + static std::unique_ptr InferTargetMachineForJIT( + const llvm::TargetOptions& target_options, + llvm::CodeGenOpt::Level opt_level); + private: llvm::JITSymbol ResolveRuntimeSymbol(const std::string& name); diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc index eeb049737dddd1..a0cd8ee2d2be10 100644 --- a/tensorflow/compiler/xla/service/cpu/target_machine_features.cc +++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.cc @@ -18,7 +18,7 @@ limitations under the License. namespace xla { namespace cpu { -llvm::TargetTransformInfo* TargetMachineFeatures::GetTargetTransformInfoFor( +llvm::TargetTransformInfo* LLVMTargetMachineFeatures::GetTargetTransformInfoFor( const llvm::Function& function) const { auto it = target_transform_info_cache_.find(&function); if (it == target_transform_info_cache_.end()) { @@ -31,5 +31,30 @@ llvm::TargetTransformInfo* TargetMachineFeatures::GetTargetTransformInfoFor( return &it->second; } +int64 LLVMTargetMachineFeatures::minimum_alignment_for_allocation( + int64 size_bytes) const { + // GLibc malloc returns a pointer with alignment 8 on 32-bit platforms and 16 + // on 64-bit platforms. TCMalloc returns a pointer with alignment 8 for + // allocations smaller than kMallocAlignmentThreshold bytes and at least + // alignment 16 for allocations greater than or equal to + // kMallocAlignmentThreshold bytes. N.B. We could improve on this lower bound + // by explicitly allocating the memory with posix_memalign. This is + // complicated by our desire to allow parameter buffers created by clients to + // be consumed directly by the JIT. + if (size_bytes == 0) { + // No need to align empty buffers. + return 1; + } + + const int64 kMallocAlignmentThreshold = 512; + + int pointer_size = target_machine_->getPointerSize(0); + int buffer_alignment = + size_bytes >= kMallocAlignmentThreshold ? 2 * pointer_size : pointer_size; + DCHECK_GT(buffer_alignment, 0); + + return buffer_alignment; +} + } // namespace cpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features.h b/tensorflow/compiler/xla/service/cpu/target_machine_features.h index 703942615e552d..8b00ae9e47eeed 100644 --- a/tensorflow/compiler/xla/service/cpu/target_machine_features.h +++ b/tensorflow/compiler/xla/service/cpu/target_machine_features.h @@ -24,43 +24,68 @@ limitations under the License. namespace xla { namespace cpu { -// Wraps an llvm::TargetMachine and parses out some information that feeds into -// LLVM IR code generation decisions. +// Abstract interface for classes providing information about the target we're +// compiling for. class TargetMachineFeatures { public: static constexpr int kX86AvxVectorByteSize = 32; - TargetMachineFeatures(llvm::TargetMachine* target_machine) - : target_machine_(target_machine) {} + // Input and output tensor buffers must be aligned to this many bytes if we + // want to call an Eigen backed GEMM or Convolution. + static constexpr int kEigenExpectedTensorAlignment = 16; // Return the vectorization factor, which is the number of bytes of data // explicitly vectorized routines will try to process at once. - int vectorization_factor_in_bytes() const { - // Ideally this should be a function of the cache line size (which we can - // get from llvm::TargetTransformInfo::getCacheLineSize) of the target - // machine. Guess a value of 128 bytes for now. 
- return 128; - } + virtual int vectorization_factor_in_bytes() const = 0; // Return the size of the largest vector size in bytes. We need to pass in // "function" since llvm functions can contain annotations for specializing // them to specific micro-architectures (though currently XLA does not use // this functionality). - int vector_register_byte_size(const llvm::Function& function) const { - llvm::TargetTransformInfo* tti = GetTargetTransformInfoFor(function); - return tti->getRegisterBitWidth(/*Vector=*/true) / 8; - } + virtual int vector_register_byte_size( + const llvm::Function& function) const = 0; // Return the number of elements of type `type` that can fit into the largest // vector register available. We need to pass in "function" since llvm // functions can contain annotations for specializing them to specific // micro-architectures (though currently XLA does not use this functionality). + virtual int vector_register_num_elements(const llvm::Function& function, + PrimitiveType type) const = 0; + + // Returns the minimum alignment for a buffer of size size_bytes. + virtual int64 minimum_alignment_for_allocation(int64 size_bytes) const = 0; + + virtual ~TargetMachineFeatures() = default; +}; + +// Implements the TargetMachineFeatures interface using an llvm::TargetMachine. +class LLVMTargetMachineFeatures : public TargetMachineFeatures { + public: + static constexpr int kX86AvxVectorByteSize = 32; + + LLVMTargetMachineFeatures(llvm::TargetMachine* target_machine) + : target_machine_(target_machine) {} + + int vectorization_factor_in_bytes() const override { + // Ideally this should be a function of the cache line size (which we can + // get from llvm::TargetTransformInfo::getCacheLineSize) of the target + // machine. Guess a value of 128 bytes for now. + return 128; + } + + int vector_register_byte_size(const llvm::Function& function) const override { + llvm::TargetTransformInfo* tti = GetTargetTransformInfoFor(function); + return tti->getRegisterBitWidth(/*Vector=*/true) / 8; + } + int vector_register_num_elements(const llvm::Function& function, - PrimitiveType type) const { + PrimitiveType type) const override { return vector_register_byte_size(function) / (primitive_util::BitWidth(type) / 8); } + int64 minimum_alignment_for_allocation(int64 size_bytes) const override; + private: llvm::TargetTransformInfo* GetTargetTransformInfoFor( const llvm::Function& function) const; diff --git a/tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h b/tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h new file mode 100644 index 00000000000000..ffc6927cbe1a2b --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_FAKE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_FAKE_H_ + +#include "tensorflow/compiler/xla/service/cpu/target_machine_features.h" + +namespace xla { +namespace cpu { +// Delegates calls to minimum_alignment_for_allocation to a user provided +// std::function, crashes on all other methods. +// +// Primarily useful for testing. +class TargetMachineFeaturesWithFakeAlignmentLogic + : public TargetMachineFeatures { + public: + explicit TargetMachineFeaturesWithFakeAlignmentLogic( + std::function fake_alignment_logic) + : fake_alignment_logic_(std::move(fake_alignment_logic)) {} + + int vectorization_factor_in_bytes() const override { + LOG(FATAL) << "Unexpected call to " << __func__; + } + + int vector_register_byte_size(const llvm::Function& function) const override { + LOG(FATAL) << "Unexpected call to " << __func__; + } + + int vector_register_num_elements(const llvm::Function& function, + PrimitiveType type) const override { + LOG(FATAL) << "Unexpected call to " << __func__; + } + + int64 minimum_alignment_for_allocation(int64 size_bytes) const override { + return fake_alignment_logic_(size_bytes); + } + + private: + std::function fake_alignment_logic_; +}; +} // namespace cpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TARGET_MACHINE_FEATURES_FAKE_H_ diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD new file mode 100644 index 00000000000000..66ae5ef0f66e90 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD @@ -0,0 +1,176 @@ +# Description: +# Tests for LLVM-based CPU backend for XLA. + +licenses(["notice"]) # Apache 2.0 + +package( + default_visibility = [":friends"], +) + +package_group( + name = "friends", + includes = [ + "//tensorflow/compiler/xla:friends", + ], +) + +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +# Filegroup used to collect source files for dependency checking. 
+filegroup( + name = "c_srcs", + data = glob([ + "**/*.cc", + "**/*.h", + ]), +) + +cc_library( + name = "cpu_codegen_test", + testonly = True, + hdrs = ["cpu_codegen_test.h"], + deps = [ + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/tests:llvm_irgen_test_base", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "cpu_fusion_test", + srcs = ["cpu_fusion_test.cc"], + deps = [ + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/cpu:cpu_instruction_fusion", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "cpu_bytesizeof_test", + srcs = ["cpu_bytesizeof_test.cc"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "cpu_external_constants_test", + srcs = ["cpu_external_constants_test.cc"], + deps = [ + "//tensorflow/compiler/xla:array2d", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/core:test", + ], +) + +tf_cc_test( + name = "cpu_noalias_test", + srcs = ["cpu_noalias_test.cc"], + deps = [ + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/service:buffer_assignment", + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/compiler/xla/service/llvm_ir:alias_analysis", + "//tensorflow/compiler/xla/service/llvm_ir:llvm_util", + "//tensorflow/compiler/xla/tests:filecheck", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@llvm//:core", + ], +) + +tf_cc_test( + name = "cpu_intrinsic_test", + srcs = ["cpu_intrinsic_test.cc"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "cpu_eigen_dot_operation_test", + srcs = ["cpu_eigen_dot_operation_test.cc"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "cpu_infeed_test", + srcs = ["cpu_infeed_test.cc"], + deps = [ + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/lib:arithmetic", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + 
"//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/service:cpu_plugin", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "cpu_literal_caching_test", + srcs = ["cpu_literal_caching_test.cc"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) + +tf_cc_test( + name = "cpu_outfeed_test", + srcs = ["cpu_outfeed_test.cc"], + deps = [ + "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/service/cpu:cpu_compiler", + "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test", + "//tensorflow/core:lib", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + ], +) diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_bytesizeof_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_bytesizeof_test.cc new file mode 100644 index 00000000000000..d5bbe7677ace67 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_bytesizeof_test.cc @@ -0,0 +1,37 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/core/platform/test.h" + +class CpuByteSizeOfTest : public ::testing::Test {}; + +TEST_F(CpuByteSizeOfTest, ARM32) { + llvm::DataLayout data_layout( + "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"); + auto tuple_shape = + xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {})}); + EXPECT_EQ(xla::llvm_ir::ByteSizeOf(tuple_shape, data_layout), + data_layout.getPointerSize(0 /* default address space */)); +} + +TEST_F(CpuByteSizeOfTest, ARM64) { + llvm::DataLayout data_layout("e-m:e-i64:64-i128:128-n32:64-S128"); + auto tuple_shape = + xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {})}); + EXPECT_EQ(xla::llvm_ir::ByteSizeOf(tuple_shape, data_layout), + data_layout.getPointerSize(0 /* default address space */)); +} diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h b/tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h new file mode 100644 index 00000000000000..7c8d07a10baf55 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h @@ -0,0 +1,30 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TESTS_CPU_CODEGEN_TEST_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TESTS_CPU_CODEGEN_TEST_H_ + +#include "tensorflow/compiler/xla/tests/llvm_irgen_test_base.h" + +namespace xla { +namespace cpu { + +// Tests that verify IR emitted by the CPU backend is as expected. +class CpuCodegenTest : public LLVMIRGenTestBase {}; + +} // namespace cpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_CPU_TESTS_CPU_CODEGEN_TEST_H_ diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc new file mode 100644 index 00000000000000..6fcce42eaa4599 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc @@ -0,0 +1,113 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Tests that we call into Eigen for dot operations as needed. 
+ +#include +#include +#include + +#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace cpu { +namespace { + +struct DotTestSpec { + PrimitiveType primitive_type; + string filecheck_lines; +}; + +string DotTestSpecToString(const ::testing::TestParamInfo& info) { + return PrimitiveType_Name(info.param.primitive_type); +} + +class CpuEigenDotOperationTest + : public CpuCodegenTest, + public ::testing::WithParamInterface { + protected: + void CompileAndCheck(std::unique_ptr entry_computation, + const string& filecheck_lines) { + CpuAotCompilationOptions options{ + /*triple=*/"x86_64", /*cpu_name=*/"", /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(std::move(entry_computation)); + + CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options, + filecheck_lines, + /*match_optimized_ir=*/true); + } +}; + +TEST_P(CpuEigenDotOperationTest, SimpleDotOp) { + HloComputation::Builder builder(TestName()); + DotTestSpec spec = GetParam(); + + auto param_shape = ShapeUtil::MakeShape(spec.primitive_type, {128, 128}); + + HloInstruction* lhs = builder.AddInstruction( + HloInstruction::CreateParameter(0, param_shape, "input")); + HloInstruction* rhs = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "input")); + + builder.AddInstruction( + HloInstruction::CreateCanonicalDot(param_shape, lhs, rhs)); + CompileAndCheck(builder.Build(), spec.filecheck_lines); +} + +TEST_P(CpuEigenDotOperationTest, DotTransposeOp) { + HloComputation::Builder builder(TestName()); + DotTestSpec spec = GetParam(); + + auto param_shape = ShapeUtil::MakeShape(spec.primitive_type, {128, 128}); + + HloInstruction* lhs = builder.AddInstruction( + HloInstruction::CreateParameter(0, param_shape, "input")); + HloInstruction* rhs = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "input")); + HloInstruction* lhs_transposed = builder.AddInstruction( + HloInstruction::CreateTranspose(param_shape, lhs, {1, 0})); + + builder.AddInstruction( + HloInstruction::CreateCanonicalDot(param_shape, lhs_transposed, rhs)); + CompileAndCheck(builder.Build(), spec.filecheck_lines); +} + +std::vector GetDotTestCases() { + std::vector result; + result.push_back( + {F16, R"(CHECK: call void @__xla_cpu_runtime_EigenMatMulF16)"}); + result.push_back( + {F32, R"(CHECK: call void @__xla_cpu_runtime_EigenMatMulF32)"}); + result.push_back( + {F64, R"(CHECK: call void @__xla_cpu_runtime_EigenMatMulF64)"}); + return result; +} + +INSTANTIATE_TEST_CASE_P(CpuEigenDotOperationTestInstantiation, + CpuEigenDotOperationTest, + ::testing::ValuesIn(GetDotTestCases()), + DotTestSpecToString); + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc new file mode 100644 index 00000000000000..faac927027c48e --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc @@ -0,0 +1,73 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace cpu { +namespace { +class CpuExternalConstantsTest : public CpuCodegenTest { + public: + void TestWithArray(int64 rows, int64 cols, const char* filecheck_pattern) { + HloComputation::Builder builder(TestName()); + + Array2D backing_array(rows, cols); + backing_array.FillUnique(); + + auto shape = ShapeUtil::MakeShape(F32, {rows, cols}); + + HloInstruction* constant = + builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR2FromArray2D(backing_array))); + HloInstruction* param = + builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x")); + builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, constant)); + + std::unique_ptr module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + CompileAndVerifyIr(std::move(module), filecheck_pattern, + /*match_optimized_ir=*/false); + } +}; + +TEST_F(CpuExternalConstantsTest, Basic) { + TestWithArray(/*rows=*/1024, /*cols=*/1024, R"( +CHECK: @constant_global_0 = external constant [1024 x [1024 x float]], align 16 +)"); +} + +TEST_F(CpuExternalConstantsTest, BasicNegative) { + // The constant array in this test case is small enough that there is no need + // to externalize it. + TestWithArray(/*rows=*/4, /*cols=*/4, R"( +CHECK-NOT: @constant_global_0 = external constant [16 x float], align 8 +CHECK: @0 = private constant [16 x float] {{.*}}, align 8 +)"); +} +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc new file mode 100644 index 00000000000000..23e7a3de4d8188 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_fusion_test.cc @@ -0,0 +1,330 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace cpu { +namespace { + +class CpuFusionTest : public HloTestBase { + protected: + CpuFusionTest() {} + + ErrorSpec error_spec_{0.0001, 1e-5}; +}; + +TEST_F(CpuFusionTest, FuseTwoElementwiseOps) { + auto builder = HloComputation::Builder(TestName()); + auto input_literal1 = Literal::CreateR1({1.0, 2.0, 3.0}); + auto input_literal2 = Literal::CreateR1({-2.0, -42.0, 2.0}); + Shape vshape = input_literal1->shape(); + + auto input1 = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(input_literal1))); + auto input2 = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(input_literal2))); + + auto add1 = builder.AddInstruction( + HloInstruction::CreateBinary(vshape, HloOpcode::kAdd, input1, input2)); + builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, add1)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + CpuInstructionFusion fusion; + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); + + // The computation root instruction was fused. Verify the fusion instruction + // is now the root. + auto computation = module->entry_computation(); + auto fusion_instruction = computation->root_instruction(); + EXPECT_EQ(HloOpcode::kFusion, fusion_instruction->opcode()); + EXPECT_EQ(HloOpcode::kNegate, + fusion_instruction->fused_expression_root()->opcode()); + // There should be four fused instructions: 2 parameters, the add, and the + // negate. + EXPECT_EQ(4, fusion_instruction->fused_instruction_count()); + + // Compile and execute the computation. + auto result = ExecuteAndTransfer(std::move(module), {}); + + // Check the output correctness. 
+ LiteralTestUtil::ExpectR1Near({1.0, 40.0, -5.0}, *result, error_spec_); +} + +TEST_F(CpuFusionTest, FuseElementwiseOpChain) { + auto builder = HloComputation::Builder(TestName()); + auto input_literal = Literal::CreateR1({-1.5, -2.5, -3.0}); + Shape vshape = input_literal->shape(); + + auto input = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(input_literal))); + auto negate = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, input)); + auto ceil = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kCeil, negate)); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kExp, ceil)); + auto floor = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kFloor, exp)); + auto two = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + builder.AddInstruction( + HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, two, floor)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + CpuInstructionFusion fusion; + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); + + // The computation root instruction was fused. Verify the fusion instruction + // is now the root. + auto computation = module->entry_computation(); + auto fusion_instruction = computation->root_instruction(); + EXPECT_EQ(HloOpcode::kFusion, fusion_instruction->opcode()); + EXPECT_EQ(HloOpcode::kMultiply, + fusion_instruction->fused_expression_root()->opcode()); + // There should be 7 fused instructions: 2 parameters and the fused + // operations. + EXPECT_EQ(7, fusion_instruction->fused_instruction_count()); + + // Compile and execute the computation. + auto result = ExecuteAndTransfer(std::move(module), {}); + + // Check the output correctness. + LiteralTestUtil::ExpectR1Near({14.0, 40.0, 40.0}, *result, + error_spec_); +} + +TEST_F(CpuFusionTest, ElementwiseOpChainWithNonfusableInstruction) { + // Test a chain of fusable ops with a non-fusable op (a reduce) thrown in the + // middle. + auto module = CreateNewModule(); + auto builder = HloComputation::Builder(TestName()); + auto input_literal = Literal::CreateR1({-1.5, -2.5, -3.0}); + Shape vshape = input_literal->shape(); + + auto input = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(input_literal))); + auto negate = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, input)); + auto ceil = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kCeil, negate)); + + auto cshape = ShapeUtil::MakeShape(F32, {6}); + auto concatenate = builder.AddInstruction( + HloInstruction::CreateConcatenate(cshape, {ceil, ceil}, /*dimension=*/0)); + + // Build an x+y computation to use in a reduce. + Shape r0f32 = ShapeUtil::MakeShape(F32, {}); + auto embedded_builder = HloComputation::Builder("f32+f32"); + embedded_builder.AddInstruction(HloInstruction::CreateBinary( + r0f32, HloOpcode::kAdd, + embedded_builder.AddInstruction( + HloInstruction::CreateParameter(0, r0f32, "x")), + embedded_builder.AddInstruction( + HloInstruction::CreateParameter(1, r0f32, "y")))); + auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build()); + + // This is a nop reduction. 
+ auto reduce = builder.AddInstruction(HloInstruction::CreateReduce( + cshape, + builder.AddInstruction(HloInstruction::CreateReshape( + ShapeUtil::MakeShape(F32, {6, 1}), concatenate)), + /*init_value=*/ + builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(0))), + /*dimensions_to_reduce=*/{1}, add_f32)); + + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(cshape, HloOpcode::kExp, reduce)); + auto floor = builder.AddInstruction( + HloInstruction::CreateUnary(cshape, HloOpcode::kFloor, exp)); + auto two = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(2.0))); + builder.AddInstruction( + HloInstruction::CreateBinary(cshape, HloOpcode::kMultiply, two, floor)); + + module->AddEntryComputation(builder.Build()); + + CpuInstructionFusion fusion; + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); + + // The computation root instruction was fused. Verify the fusion instruction + // is now the root. + auto computation = module->entry_computation(); + + auto fusion_instruction1 = computation->root_instruction(); + EXPECT_EQ(HloOpcode::kFusion, fusion_instruction1->opcode()); + EXPECT_EQ(HloOpcode::kMultiply, + fusion_instruction1->fused_expression_root()->opcode()); + // There should be 5 fused instructions in the root fusion instruction: 2 + // parameters, multiply, floor, and exp. + EXPECT_EQ(5, fusion_instruction1->fused_instruction_count()) + << fusion_instruction1->fused_instructions_computation()->ToString(); + + auto fusion_instruction2 = reduce->operand(0); + EXPECT_EQ(HloOpcode::kFusion, fusion_instruction1->opcode()); + EXPECT_EQ(HloOpcode::kReshape, + fusion_instruction2->fused_expression_root()->opcode()); + // There should be 5 fused instructions in the second fusion instruction: 1 + // parameter, negate, ceil, concat, and reshape. + EXPECT_EQ(5, fusion_instruction2->fused_instruction_count()) + << fusion_instruction2->fused_instructions_computation()->ToString(); + + // Compile and execute the computation. + auto result = ExecuteAndTransfer(std::move(module), {}); + + // Check the output correctness. + LiteralTestUtil::ExpectR1Near({14.0, 40.0, 40.0, 14.0, 40.0, 40.0}, + *result, error_spec_); +} + +TEST_F(CpuFusionTest, TestOperandOrderToAvoidDuplication) { + // Test that the operands of an instruction to be fused are considered in the + // proper order to avoid duplication. Test input: + // + // constant = {...} + // negate = neg(constant) + // ceil = ceil(negate) + // add1 = add(negate, ceil) + // add2 = add(ceil, negate) + // + // In this example, the operands of both add1 and add2 should be fused in the + // order {ceil, negate} even though they have different orders in their + // operand vectors. Test for this problem by counting the number of nodes in + // each fusion instruction to ensure that negate is not duplicated. 
+ auto builder = HloComputation::Builder(TestName()); + auto input_literal = Literal::CreateR1({1.0, 2.0, 3.0}); + Shape vshape = input_literal->shape(); + + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(std::move(input_literal))); + auto negate = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kNegate, constant)); + auto ceil = builder.AddInstruction( + HloInstruction::CreateUnary(vshape, HloOpcode::kCeil, negate)); + + auto add1 = builder.AddInstruction( + HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, negate, ceil)); + auto add2 = builder.AddInstruction( + HloInstruction::CreateBinary(vshape, HloOpcode::kMultiply, ceil, negate)); + + // Tie together the two adds with a tuple to create a single root. + auto result = + builder.AddInstruction(HloInstruction::CreateTuple({add1, add2})); + + // Create computation and module. + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + // Run fusion. + CpuInstructionFusion fusion; + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); + + auto fusion1 = result->operand(0); + auto fusion2 = result->operand(1); + EXPECT_EQ(HloOpcode::kFusion, fusion1->opcode()); + EXPECT_EQ(HloOpcode::kFusion, fusion2->opcode()); + + // Each fusion instruction should have 4 fused instruction inside: add, ceil, + // negate, and the fused parameter. + EXPECT_EQ(4, fusion1->fused_instruction_count()); + EXPECT_EQ(4, fusion2->fused_instruction_count()); + + // Each fusion instruction should have one parameter and the parameter should + // be the constant. + EXPECT_EQ(1, fusion1->operand_count()); + EXPECT_EQ(constant, fusion1->operand(0)); + EXPECT_EQ(1, fusion2->operand_count()); + EXPECT_EQ(constant, fusion2->operand(0)); +} + +TEST_F(CpuFusionTest, DoNotDuplicateExpensiveOps) { + // Verify that expensive operations will not be fused if the fusion results in + // duplication. Test code: + // + // constant = 42.0 + // exp1 = exp(constant) + // negate1 = negate(exp1) + // exp2 = exp(constant) + // negate2 = negate(exp2) + // tuple = tuple(negate1, negate2, exp2) + // + // exp1 should be fused down into negate1, but exp2 will not be fused into + // negate2 because this will result in duplication of the expensive exp + // computation. The duplication is caused by the other use of exp2 in the + // tuple. + auto builder = HloComputation::Builder(TestName()); + auto input_literal1 = Literal::CreateR1({1.0, 2.0, 3.0}); + auto input_literal2 = Literal::CreateR1({-2.0, -42.0, 2.0}); + auto constant = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42.0))); + Shape shape = constant->shape(); + + auto exp1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, constant)); + auto negate1 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, exp1)); + + auto exp2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, constant)); + auto negate2 = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kNegate, exp2)); + + auto tuple = builder.AddInstruction( + HloInstruction::CreateTuple({negate1, negate2, exp2})); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + + CpuInstructionFusion fusion; + EXPECT_TRUE(fusion.Run(module.get()).ValueOrDie()); + + // The only fusion instruction should be operand 0 of the tuple (formerly + // negate1). 
+ EXPECT_EQ(HloOpcode::kFusion, tuple->operand(0)->opcode()); + EXPECT_EQ(HloOpcode::kNegate, tuple->operand(1)->opcode()); + EXPECT_EQ(HloOpcode::kExp, tuple->operand(2)->opcode()); + + auto fusion_inst = tuple->operand(0); + // There should be three fused instructions: negate2, exp2, and the fused + // parameter. + EXPECT_EQ(3, fusion_inst->fused_instruction_count()); + EXPECT_EQ(1, fusion_inst->operand_count()); + EXPECT_EQ(constant, fusion_inst->operand(0)); +} + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc new file mode 100644 index 00000000000000..dd63b998e9b6d0 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_infeed_test.cc @@ -0,0 +1,294 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/lib/math/math_util.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +class InfeedTest : public ClientLibraryTestBase { + protected: + // Transfers the given literal to the infeed interface of the device, and + // check if the returned data from Infeed HLO is same as the literal. + void TestInfeedRoundTrip(const Literal& literal) { + // TODO(b/31037751) Explicitly reset the Infeed state so that the + // test is not affected by the state from the previous tests by + // adding ClearInfeed if necessary when it is implemented. For now + // don't use ResetDevice since it is not implemented on CPU. + ASSERT_IS_OK(client_->TransferToInfeed(literal)); + XlaBuilder builder(TestName()); + builder.Infeed(literal.shape()); + if (ShapeUtil::IsTuple(literal.shape())) { + // TODO(b/30609564): Use ComputeAndCompareLiteral instead. 
+ ComputeAndCompareTuple(&builder, literal, {}); + } else { + ComputeAndCompareLiteral(&builder, literal, {}); + } + } +}; + +TEST_F(InfeedTest, SingleInfeedR0Bool) { + TestInfeedRoundTrip(*Literal::CreateR0(true)); +} + +TEST_F(InfeedTest, SingleInfeedR1U32) { + TestInfeedRoundTrip(*Literal::CreateR1({1, 2, 3})); +} + +TEST_F(InfeedTest, SingleInfeedR2F32) { + TestInfeedRoundTrip(*Literal::CreateR2F32Linspace(0.0, 1.0, 128, 64)); +} + +TEST_F(InfeedTest, SingleInfeedR3F32) { + TestInfeedRoundTrip( + *Literal::CreateR3({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}})); +} + +TEST_F(InfeedTest, SingleInfeedR3F32DifferentLayout) { + const Layout r3_dim0minor = LayoutUtil::MakeLayout({0, 1, 2}); + const Layout r3_dim0major = LayoutUtil::MakeLayout({2, 1, 0}); + + TestInfeedRoundTrip( + *Literal::CreateR3WithLayout({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, + r3_dim0minor)); + + TestInfeedRoundTrip( + *Literal::CreateR3WithLayout({{{1.0f, 2.0f, 3.0f}, {4.0f, 5.0f, 6.0f}}, + {{1.1f, 2.1f, 3.1f}, {6.1f, 3.5f, 2.8f}}}, + r3_dim0major)); +} + +TEST_F(InfeedTest, SingleInfeedR4S32) { + TestInfeedRoundTrip(*Literal::CreateR4( + {{{{1, -2}, {-4, 5}, {6, 7}}, {{8, 9}, {10, 11}, {12, 13}}}, + {{{10, 3}, {7, -2}, {3, 6}}, {{2, 5}, {-11, 5}, {-2, -5}}}})); +} + +TEST_F(InfeedTest, SingleInfeedTuple) { + TestInfeedRoundTrip( + *Literal::MakeTuple({Literal::CreateR1({1, 2, 3}).get(), + Literal::CreateR0(false).get()})); +} + +TEST_F(InfeedTest, SingleInfeedEmptyTuple) { + TestInfeedRoundTrip(*Literal::MakeTuple({})); +} + +// Tests Infeed operation used in a while loop, as in the code below. The +// computation is launched asynchronously, and then infeed data is transferred. +// +// float acc = 0.0f; +// while (acc < 40.0f) { +// acc += reduce_add(Infeed()); +// } +// return acc; +// TODO(b/30671675) enable this test once asynchronous execution is +// implemented for CPU. +TEST_F(InfeedTest, DISABLED_SingleInfeedInWhile) { + XlaBuilder builder(TestName()); + const auto infeed_shape = ShapeUtil::MakeShape(F32, {3}); + const auto result_shape = ShapeUtil::MakeShape(F32, {}); + + // Create a computation for the condition: repeat until (prev < 40.0f) holds. + XlaComputation condition; + { + XlaBuilder builder("condition"); + auto prev = builder.Parameter(0, result_shape, "prev"); + builder.Gt(builder.ConstantR0(40.0f), prev); + condition = builder.Build().ConsumeValueOrDie(); + } + // Create a computation for the body: add the reduced value of the Infeed + // data to the result variable. + XlaComputation body; + { + XlaBuilder builder("body"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto infeed = builder.Infeed(infeed_shape); + auto addend = + builder.Reduce(infeed, builder.ConstantR0(0.0f), + CreateScalarAddComputation(F32, &builder), {0}); + builder.Add(prev, addend); + body = builder.Build().ConsumeValueOrDie(); + } + // Create a While node with computations for the condition and the body. + auto init = builder.ConstantR0(0.0f); + builder.While(condition, body, init); + + // Build and asynchronously launch the computation. + auto computation = builder.Build().ConsumeValueOrDie(); + std::unique_ptr result; + tensorflow::Thread* computation_thread = + tensorflow::Env::Default()->StartThread( + tensorflow::ThreadOptions{}, "computation_thread", [&] { + result = client_->Execute(computation, {}, &execution_options_) + .ValueOrDie(); + }); + + // Send 5 Infeed data of shape F32[3]. 
+ ASSERT_IS_OK(client_->TransferToInfeed(*Literal::CreateR1({1, 2, 3}))); + ASSERT_IS_OK(client_->TransferToInfeed(*Literal::CreateR1({4, 5, 6}))); + ASSERT_IS_OK(client_->TransferToInfeed(*Literal::CreateR1({7, 8, 9}))); + ASSERT_IS_OK( + client_->TransferToInfeed(*Literal::CreateR1({10, 11, 12}))); + ASSERT_IS_OK( + client_->TransferToInfeed(*Literal::CreateR1({13, 14, 15}))); + + delete computation_thread; // Joins the thread. + auto result_literal = client_->Transfer(*result).ConsumeValueOrDie(); + + // Only the first 3 infeed data should be added. + LiteralTestUtil::ExpectR0Near(45.0f, *result_literal, ErrorSpec{1e-7}); +} + +// Tests two Infeed operations with a total order. The order is enforced by +// using the result of the first while loop as the initial value of the second +// while loop. The shapes of both Infeeds are Tuples, where the first tuple +// element (R1F32) is for the data to reduce and accumulate, and the second +// tuple element (PRED) to indicate whether the loop should continue. The +// computation is launched asynchronously, and then infeed data is transferred. +// +// float acc = 0.0f; +// continue = true; +// while (!continue) { +// (data, continue) = Infeed(shape1); +// acc += reduce_add(data) +// } +// continue = true; +// while(!continue) { +// (data, continue) = Infeed(shape2); +// acc += reduce_add(data) +// } +// return acc; +// TODO(b/30671675) enable this test once asynchronous execution is +// implemented for CPU. +TEST_F(InfeedTest, DISABLED_TwoInfeedsInTotalOrder) { + XlaBuilder builder(TestName()); + const auto infeed1_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeShape(PRED, {})}); + const auto infeed2_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {3}), ShapeUtil::MakeShape(PRED, {})}); + const auto result_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {}), ShapeUtil::MakeShape(PRED, {})}); + + // Create a computation for the condition: repeat until the second tuple + // element is false. + XlaComputation condition; + { + XlaBuilder builder("condition"); + auto prev = builder.Parameter(0, result_shape, "prev"); + builder.GetTupleElement(prev, 1); + condition = builder.Build().ConsumeValueOrDie(); + } + + // A lambda that builds the body computation of a while loop with the given + // infeed shape, and returns the computation with the ownership. + // + // The body adds the reduced value of the Infeed data (first tuple element) + // to the previous accumulator, and returns the accumulator and the continue + // flag (second tuple element) as a tuple. + const auto build_body = [this, &result_shape](const Shape& infeed_shape) { + XlaComputation body; + XlaBuilder builder("body"); + auto prev = builder.Parameter(0, result_shape, "prev"); + auto infeed = builder.Infeed(infeed_shape); + auto addend = builder.Reduce( + builder.GetTupleElement(infeed, 0), builder.ConstantR0(0.0f), + CreateScalarAddComputation(F32, &builder), {0}); + auto result = builder.Add(builder.GetTupleElement(prev, 0), addend); + builder.Tuple({result, builder.GetTupleElement(infeed, 1)}); + return builder.Build().ConsumeValueOrDie(); + }; + + // Create the first while loop with infeed1_shape. + auto init = builder.Tuple( + {builder.ConstantR0(0.0f), builder.ConstantR0(true)}); + auto while1 = builder.While(condition, build_body(infeed1_shape), init); + auto result1 = builder.Tuple( + {builder.GetTupleElement(while1, 0), builder.ConstantR0(true)}); + + // Create the second while loop with infeed2_shape. 
Note that the result from + // the first while loop is used as the initial value. + auto while2 = builder.While(condition, build_body(infeed2_shape), result1); + builder.GetTupleElement(while2, 0); + + // Build the computation. + auto computation = builder.Build().ConsumeValueOrDie(); + + // Send the first 4 Infeed data of shape Tuple(F32[2], PRED). + ASSERT_IS_OK(client_->TransferToInfeed( + *Literal::MakeTuple({Literal::CreateR1({1, 2}).get(), + Literal::CreateR0(true).get()}))); + ASSERT_IS_OK(client_->TransferToInfeed( + *Literal::MakeTuple({Literal::CreateR1({3, 4}).get(), + Literal::CreateR0(true).get()}))); + ASSERT_IS_OK(client_->TransferToInfeed( + *Literal::MakeTuple({Literal::CreateR1({5, 6}).get(), + Literal::CreateR0(true).get()}))); + ASSERT_IS_OK(client_->TransferToInfeed( + *Literal::MakeTuple({Literal::CreateR1({7, 8}).get(), + Literal::CreateR0(false).get()}))); + + // Asynchronously launch the execution on the device. + std::unique_ptr result; + tensorflow::Thread* computation_thread = + tensorflow::Env::Default()->StartThread( + tensorflow::ThreadOptions{}, "computation_thread", [&] { + result = client_->Execute(computation, {}, &execution_options_) + .ValueOrDie(); + }); + + // Wait for a second to ensure testing that the execution is waiting on the + // Infeed data, and send the rest Infeed data of shape Tuple(F32[3], PRED). + sleep(1); + ASSERT_IS_OK(client_->TransferToInfeed( + *Literal::MakeTuple({Literal::CreateR1({1, 2, 3}).get(), + Literal::CreateR0(true).get()}))); + ASSERT_IS_OK(client_->TransferToInfeed( + *Literal::MakeTuple({Literal::CreateR1({7, 8, 9}).get(), + Literal::CreateR0(false).get()}))); + ASSERT_IS_OK(client_->TransferToInfeed( + *Literal::MakeTuple({Literal::CreateR1({4, 5, 6}).get(), + Literal::CreateR0(true).get()}))); + + // Wait for the execution to be done, and transfer the result. + delete computation_thread; // Joins the thread. + auto result_literal = client_->Transfer(*result).ConsumeValueOrDie(); + + // Only the first 6 infeed data should be added. + LiteralTestUtil::ExpectR0Near(66.0f, *result_literal, ErrorSpec{1e-7}); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc new file mode 100644 index 00000000000000..973aac8766f5aa --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc @@ -0,0 +1,151 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace cpu { +namespace { + +const char* const kTriple_x86_64 = "x86_64-pc-linux"; +const char* const kTriple_android_arm = "armv7-none-android"; + +struct IntrinsicTestSpec { + HloOpcode opcode; + tensorflow::StringPiece triple; + tensorflow::StringPiece features; + tensorflow::StringPiece check_lines; +}; + +// Tests that unary functions get lowered using intrinsic calls. +class CpuUnaryIntrinsicTest + : public CpuCodegenTest, + public ::testing::WithParamInterface { + public: + static string Name(const ::testing::TestParamInfo& info) { + auto spec = info.param; + + string opcode = HloOpcodeString(spec.opcode); + opcode[0] = toupper(opcode[0]); + + string triple{spec.triple.data(), spec.triple.size()}; + if (triple == kTriple_x86_64) { + triple = "x86_64"; + } else if (triple == kTriple_android_arm) { + triple = "android_arm"; + } else { + triple = "Unknown"; + } + + string features{spec.features.data(), spec.features.size()}; + if (!features.empty()) { + std::replace_if(features.begin(), features.end(), + [](char c) { return c != '_' && !isalnum(c); }, '_'); + } else { + features = ""; + } + + return tensorflow::strings::StrCat(opcode.c_str(), "_On_", triple.c_str(), + features.empty() ? "" : "_With", + features.c_str()); + } +}; + +// Creates a module with a call to the unary op, and tests if the +// compiler replaced it with a call to the intrinsic. +TEST_P(CpuUnaryIntrinsicTest, DoIt) { + HloComputation::Builder builder(TestName()); + IntrinsicTestSpec spec = GetParam(); + + auto param_shape = ShapeUtil::MakeShape(F32, {1024}); + HloInstruction* param = builder.AddInstruction( + HloInstruction::CreateParameter(0, param_shape, "input")); + builder.AddInstruction( + HloInstruction::CreateUnary(param_shape, spec.opcode, param)); + std::unique_ptr computation = builder.Build(); + + string triple{spec.triple.data(), spec.triple.size()}; + string features{spec.features.data(), spec.features.size()}; + + CpuAotCompilationOptions options{ + /*triple=*/triple, /*cpu_name=*/"", /*features=*/features, + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(std::move(computation)); + + string check_lines{spec.check_lines.data(), spec.check_lines.size()}; + + CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options, check_lines, + /*match_optimized_ir=*/true); +} + +IntrinsicTestSpec CpuUnaryIntrinsicTestCases[] = { + // The intrinsics are always inlined, so we match a line from it instead of + // a function call. 
+ + IntrinsicTestSpec{ + HloOpcode::kExp, kTriple_x86_64, "", + R"(CHECK: fmul fast <4 x float> )"}, + + IntrinsicTestSpec{ + HloOpcode::kExp, kTriple_x86_64, "+avx", + R"(CHECK: fmul fast <8 x float> )"}, + + IntrinsicTestSpec{ + HloOpcode::kExp, kTriple_android_arm, "+neon", + R"(CHECK: fmul fast <4 x float> )"}, + + IntrinsicTestSpec{ + HloOpcode::kTanh, kTriple_x86_64, "", + R"(CHECK: fcmp fast uge <4 x float> %wide.load, )"}, + + IntrinsicTestSpec{ + HloOpcode::kTanh, kTriple_x86_64, "+avx", + R"(CHECK: fcmp fast uge <8 x float> %wide.load, )"}, + + IntrinsicTestSpec{ + HloOpcode::kTanh, kTriple_android_arm, "", + R"(CHECK: fcmp fast uge <4 x float> %wide.load, )"}, + + IntrinsicTestSpec{ + HloOpcode::kLog, kTriple_x86_64, "", + R"(CHECK: fadd fast <4 x float> )"}, + + IntrinsicTestSpec{ + HloOpcode::kLog, kTriple_x86_64, "+avx", + R"(CHECK: fadd fast <8 x float> )"}, + + IntrinsicTestSpec{ + HloOpcode::kLog, kTriple_android_arm, "", + R"(CHECK: fadd fast <4 x float> )"}}; + +INSTANTIATE_TEST_CASE_P(CpuUnaryIntrinsicTestInstantiation, + CpuUnaryIntrinsicTest, + ::testing::ValuesIn(CpuUnaryIntrinsicTestCases), + CpuUnaryIntrinsicTest::Name); + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc new file mode 100644 index 00000000000000..27044b1d62027e --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc @@ -0,0 +1,121 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" + +namespace xla { +namespace cpu { +namespace { +class CpuDuplicateConstantsTest : public CpuCodegenTest {}; + +TEST_F(CpuDuplicateConstantsTest, RepeatedArrayConstants) { + // We use a while loop here to force the two constant HloInstructions to be in + // different computations. Otherwise the HLO optimizer itself CSEs them. 
+ const string hlo_text = R"( +HloModule RepeatedConstants + +while_body { + arg_body = f32[2,3,2] parameter(0) + ROOT const = f32[2,3,2] constant( + f32[2,3,2] + {{{1, 2}, {1001, 1002}, {2001, 2002}}, + {{2, 1}, {2001, 3002}, {2001, 2002}}}) +} + +while_cond { + arg_cond = f32[2,3,2] parameter(0) + ROOT unknown = pred[] infeed() +} + +ENTRY main { + param = f32[2,3,2] parameter(0) + const_a = f32[2,3,2] constant( + f32[2,3,2] + {{{1, 2}, {1001, 1002}, {2001, 2002}}, + {{2, 1}, {2001, 3002}, {2001, 2002}}}) + const_b = f32[2,3,2] while(f32[2,3,2] const_a), condition=while_cond, body=while_body + + out0 = () outfeed(f32[2,3,2] const_a) + ROOT out1 = () outfeed(f32[2,3,2] const_b) +} +)"; + + string filecheck_pattern = R"( +CHECK: private constant [12 x float] +CHECK-NOT: private constant [12 x float] +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text)); + + CpuAotCompilationOptions options{ + /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(module), options, filecheck_pattern, + /*match_optimized_ir=*/false); +} + +TEST_F(CpuDuplicateConstantsTest, RepeatedTupleConstants) { + // We use a while loop here to force the two constant HloInstructions to be in + // different computations. Otherwise the HLO optimizer itself CSEs them. + const string hlo_text = R"( +HloModule RepeatedConstants + +while_body { + arg_body = (f32[2,1]{1,0}, f32[1]{0}) parameter(0) + ROOT const = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} )) +} + +while_cond { + arg_cond = (f32[2,1]{1,0}, f32[1]{0}) parameter(0) + ROOT unknown = pred[] infeed() +} + +ENTRY main { + param = f32[2,3,2] parameter(0) + const_a = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} )) + const_b = (f32[2,1]{1,0}, f32[1]{0}) while((f32[2,1]{1,0}, f32[1]{0}) const_a), condition=while_cond, body=while_body + + out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a) + ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b) +} +)"; + + string filecheck_pattern = R"( +CHECK: private constant [1 x float] +CHECK: private constant [2 x float] +CHECK-NOT: private constant [1 x float] +CHECK-NOT: private constant [2 x float] +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text)); + + CpuAotCompilationOptions options{ + /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(module), options, filecheck_pattern, + /*match_optimized_ir=*/false); +} + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc new file mode 100644 index 00000000000000..3b6b0ed7406561 --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc @@ -0,0 +1,136 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "llvm/IR/Module.h" +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/buffer_assignment.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/llvm_ir/alias_analysis.h" +#include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/filecheck.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace cpu { + +class CpuNoAliasTest : public CpuCodegenTest {}; + +// Creates a simple HLO ir_module (runs concat(concat(x, y), x)), and then +// inspects the aliasing information for loads to its buffers. +TEST_F(CpuNoAliasTest, Concat) { + HloComputation::Builder builder(TestName()); + + std::unique_ptr literal = + Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}); + auto param_shape = ShapeUtil::MakeShape(F32, {2, 2}); + HloInstruction* param_x = builder.AddInstruction( + HloInstruction::CreateParameter(0, param_shape, "x")); + HloInstruction* param_y = builder.AddInstruction( + HloInstruction::CreateParameter(1, param_shape, "y")); + HloInstruction* concat1 = + builder.AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(F32, {2, 4}), {param_x, param_y}, 1)); + HloInstruction* concat2 = + builder.AddInstruction(HloInstruction::CreateConcatenate( + ShapeUtil::MakeShape(F32, {2, 6}), {concat1, param_x}, 1)); + + std::unique_ptr computation = builder.Build(); + + auto hlo_module = CreateNewModule(); + hlo_module->AddEntryComputation(std::move(computation)); + + // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it. + auto status_or_buffer_assn = BufferAssigner::Run( + hlo_module.get(), MakeUnique(hlo_module.get()), + backend().compiler()->BufferSizeBytesFunction(), + [](LogicalBuffer::Color) { return /*alignment=*/1; }); + ASSERT_EQ(status_or_buffer_assn.status(), Status::OK()); + + llvm::LLVMContext context; + llvm_ir::AliasAnalysis aa(*hlo_module, *status_or_buffer_assn.ValueOrDie(), + &context); + + // Construct an LLVM module containing loads that we annotate as being from + // the buffers in the HLO module. We'll inspect these loads to ensure that + // they have the expected alias information. 
+ llvm::Module ir_module("test", context); + llvm::Function* func = llvm::cast( + ir_module.getOrInsertFunction("test_fn", llvm::Type::getVoidTy(context))); + llvm::BasicBlock* bb = llvm::BasicBlock::Create(context, "body", func); + llvm::IRBuilder<> ir_builder(bb); + auto* zero = llvm::ConstantInt::get(llvm::Type::getInt32Ty(context), 0); + llvm_ir::IrArray::Index zero2D({zero, zero}); + + llvm::ArrayType* array2d_type = llvm::ArrayType::get( + llvm::ArrayType::get(llvm::Type::getFloatTy(context), 100), 100); + + { + llvm::Value* param_x_val = + ir_module.getOrInsertGlobal("param_x", array2d_type); + llvm_ir::IrArray param_x_array(param_x_val, param_shape); + aa.AddAliasingInformationToIrArray(*param_x, ¶m_x_array); + param_x_array.EmitReadArrayElement(zero2D, &ir_builder) + ->setName("read_param_x_array"); + } + + { + llvm::Value* concat1_val = + ir_module.getOrInsertGlobal("concat1", array2d_type); + auto shape = ShapeUtil::MakeShape(F32, {2, 4}); + llvm_ir::IrArray concat1_array(concat1_val, shape); + aa.AddAliasingInformationToIrArray(*concat1, &concat1_array); + concat1_array.EmitReadArrayElement(zero2D, &ir_builder) + ->setName("read_concat1_array"); + } + + { + llvm::Value* concat2_val = + ir_module.getOrInsertGlobal("concat2", array2d_type); + auto shape = ShapeUtil::MakeShape(F32, {2, 6}); + llvm_ir::IrArray concat2_array(concat2_val, shape); + aa.AddAliasingInformationToIrArray(*concat2, &concat2_array); + concat2_array.EmitReadArrayElement(zero2D, &ir_builder) + ->setName("read_concat2_array"); + } + + // Check the AA info in the loads. + const char* filecheck_pattern = R"( + CHECK: %read_param_x_array = load {{.*}} !noalias [[param_x_noalias:![0-9]+]] + CHECK: %read_concat1_array = load {{.*}} !alias.scope [[concat1_scope:![0-9]+]], !noalias [[concat1_noalias:![0-9]+]] + CHECK: %read_concat2_array = load {{.*}} !alias.scope [[concat1_noalias]], !noalias [[concat1_scope]] + CHECK-DAG: [[buf_size32:![0-9]+]] = !{!"buffer:{{.*}} size:32 + CHECK-DAG: [[buf_size48:![0-9]+]] = !{!"buffer:{{.*}} size:48 + CHECK-DAG: [[param_x_noalias]] = !{[[buf_size32]], [[buf_size48]]} + CHECK-DAG: [[concat1_scope]] = !{[[buf_size32]]} + CHECK-DAG: [[concat1_noalias]] = !{[[buf_size48]]} + )"; + + TF_ASSERT_OK_AND_ASSIGN( + bool filecheck_match, + RunFileCheck(llvm_ir::DumpModuleToString(ir_module), filecheck_pattern)); + EXPECT_TRUE(filecheck_match); +} + +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc new file mode 100644 index 00000000000000..1ee279290b6fcf --- /dev/null +++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc @@ -0,0 +1,57 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" +#include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" + +namespace xla { +namespace cpu { +namespace { +class CpuOutfeedTest : public CpuCodegenTest {}; + +TEST_F(CpuOutfeedTest, OutfeedRoot) { + const string hlo_text = R"( +HloModule Outfeed + +ENTRY main { + const_a = f32[2,3,2] constant( + f32[2,3,2] + {{{1, 2}, {1001, 1002}, {2001, 2002}}, + {{2, 1}, {2001, 3002}, {2001, 2002}}}) + + ROOT out = () outfeed(f32[2,3,2] const_a) +} +)"; + + string filecheck_pattern = R"( +CHECK: private constant [12 x float] +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text)); + + CpuAotCompilationOptions options{ + /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"", + /*entry_point_name=*/"entry", + /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static}; + + CompileAheadOfTimeAndVerifyIr(std::move(module), options, filecheck_pattern, + /*match_optimized_ir=*/false); +} + +} // namespace +} // namespace cpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h index 6479bf76aab581..edcaec584997b1 100644 --- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h +++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h @@ -143,6 +143,12 @@ class VectorSupportLibrary { llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer, llvm::Value* offset_elements); + llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer, + llvm::Value* offset_elements, int64 scale) { + return ComputeOffsetPointer( + base_pointer, + ir_builder_->CreateMul(ir_builder_->getInt64(scale), offset_elements)); + } llvm::Value* ComputeOffsetPointer(llvm::Value* base_pointer, int64 offset_elements) { return ComputeOffsetPointer(base_pointer, diff --git a/tensorflow/compiler/xla/service/despecializer.h b/tensorflow/compiler/xla/service/despecializer.h index af48f4ab6e506d..cc1695b7f86380 100644 --- a/tensorflow/compiler/xla/service/despecializer.h +++ b/tensorflow/compiler/xla/service/despecializer.h @@ -25,7 +25,7 @@ namespace xla { // Creates an HloPassPipeline containing multiple HloPasses that can // despecialize an optimized HloModule. This is useful to run an HloModule -// optimized for one specfic platform on a different platform (undoing platform +// optimized for one specific platform on a different platform (undoing platform // specific passes) with matching numerics for comparison. 
// // Current despecialization passes are Defuser, ImplicitBroadcastRemover, diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.cc b/tensorflow/compiler/xla/service/device_memory_allocator.cc index 35db4fd2a22cc1..e228bb56bce8fe 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.cc +++ b/tensorflow/compiler/xla/service/device_memory_allocator.cc @@ -29,7 +29,7 @@ StreamExecutorMemoryAllocator::StreamExecutorMemoryAllocator( : DeviceMemoryAllocator(platform), stream_executors_(stream_executors.begin(), stream_executors.end()) {} -StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate( +StatusOr<OwningDeviceMemory> StreamExecutorMemoryAllocator::Allocate( int device_ordinal, uint64 size, bool retry_on_failure) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, GetStreamExecutor(device_ordinal)); @@ -40,22 +40,17 @@ StatusOr<se::DeviceMemoryBase> StreamExecutorMemoryAllocator::Allocate( tensorflow::strings::HumanReadableNumBytes(size).c_str(), size, device_ordinal); } - return result; + return OwningDeviceMemory(result, device_ordinal, this); } -tensorflow::Status StreamExecutorMemoryAllocator::Deallocate( - int device_ordinal, se::DeviceMemoryBase* mem) { - if (!mem->is_null()) { +Status StreamExecutorMemoryAllocator::Deallocate(int device_ordinal, + se::DeviceMemoryBase mem) { + if (!mem.is_null()) { TF_ASSIGN_OR_RETURN(se::StreamExecutor * stream_executor, GetStreamExecutor(device_ordinal)); - // We make a local copy of 'mem' so the original is not zeroed out by the - // Deallocate() call below. This gives us a better chance of - // catching double-free bugs, since Deallocate silently succeeds for null - // values. - se::DeviceMemoryBase mem_copy(*mem); - stream_executor->Deallocate(&mem_copy); + stream_executor->Deallocate(&mem); } - return tensorflow::Status::OK(); + return Status::OK(); } StatusOr<se::StreamExecutor*> StreamExecutorMemoryAllocator::GetStreamExecutor( diff --git a/tensorflow/compiler/xla/service/device_memory_allocator.h b/tensorflow/compiler/xla/service/device_memory_allocator.h index da45c4d45a1c56..d87b86caf0d3ac 100644 --- a/tensorflow/compiler/xla/service/device_memory_allocator.h +++ b/tensorflow/compiler/xla/service/device_memory_allocator.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/service/owning_device_memory.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -37,28 +38,29 @@ class DeviceMemoryAllocator { : platform_(platform) {} virtual ~DeviceMemoryAllocator() {} + // Allocates memory on the device. + // + // If size > 0 and the returned StatusOr is OK, the wrapped OwningDeviceMemory + // must not be null. If size == 0, must return a null OwningDeviceMemory. + // // 'retry_on_failure': If false, and the first attempt to allocate the memory // fails, the allocation should return immediately without retrying. An // example use case is optional scratch spaces where a failure has only // performance impact. - // - // Allocate() should return a null pointer for a size-0 allocation. - // Deallocate() must be a no-op for null pointers. - virtual StatusOr<se::DeviceMemoryBase> Allocate(int device_ordinal, - uint64 size, - bool retry_on_failure) = 0; + virtual StatusOr<OwningDeviceMemory> Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) = 0; // Two-arg version of Allocate(), which sets retry-on-failure to true. 
// // (We don't simply use a default argument on the virtual Allocate function // because default args on virtual functions are disallowed by the Google // style guide.) - StatusOr Allocate(int device_ordinal, uint64 size) { + StatusOr Allocate(int device_ordinal, uint64 size) { return Allocate(device_ordinal, size, /*retry_on_failure=*/true); } - virtual tensorflow::Status Deallocate(int device_ordinal, - se::DeviceMemoryBase* mem) = 0; + // Must be a nop for null pointers. + virtual Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) = 0; // Return the platform that the allocator allocates memory on. const se::Platform* platform() const { return platform_; } @@ -68,6 +70,7 @@ class DeviceMemoryAllocator { virtual bool AllowsAsynchronousDeallocation() const = 0; protected: + friend class OwningDeviceMemory; const se::Platform* platform_; }; @@ -79,14 +82,13 @@ class StreamExecutorMemoryAllocator : public DeviceMemoryAllocator { const se::Platform* platform, tensorflow::gtl::ArraySlice stream_executors); - StatusOr Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) override; + StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; // Pull in two-arg overload that sets retry_on_failure to true. using DeviceMemoryAllocator::Allocate; - tensorflow::Status Deallocate(int device_ordinal, - se::DeviceMemoryBase* mem) override; + Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; bool AllowsAsynchronousDeallocation() const override; diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h index 0528b076027603..64678d9d745097 100644 --- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h +++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h @@ -138,6 +138,9 @@ class DfsHloVisitorBase { virtual Status HandleExp(HloInstructionPtr hlo) { return HandleElementwiseUnary(hlo); } + virtual Status HandleExpm1(HloInstructionPtr hlo) { + return HandleElementwiseUnary(hlo); + } virtual Status HandleFloor(HloInstructionPtr hlo) { return HandleElementwiseUnary(hlo); } @@ -150,6 +153,9 @@ class DfsHloVisitorBase { virtual Status HandleClz(HloInstructionPtr hlo) { return HandleElementwiseUnary(hlo); } + virtual Status HandleLog1p(HloInstructionPtr hlo) { + return HandleElementwiseUnary(hlo); + } virtual Status HandleCos(HloInstructionPtr hlo) { return HandleElementwiseUnary(hlo); } @@ -191,6 +197,10 @@ class DfsHloVisitorBase { return HandleElementwiseUnary(hlo); } + virtual Status HandleDomain(HloInstructionPtr hlo) { + return HandleElementwiseUnary(hlo); + } + virtual Status HandleInfeed(HloInstructionPtr hlo) = 0; virtual Status HandleOutfeed(HloInstructionPtr hlo) = 0; virtual Status HandleHostCompute(HloInstructionPtr hlo) = 0; diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc index 38b5efa9fb2cdb..9a8bab353ef6b1 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc @@ -418,8 +418,12 @@ StatusOr ElementalIrEmitter::EmitFloatUnaryOp( } case HloOpcode::kExp: return EmitExp(op->shape().element_type(), operand_value); + case HloOpcode::kExpm1: + return EmitExpm1(op->shape().element_type(), operand_value); case HloOpcode::kLog: return EmitLog(op->shape().element_type(), operand_value); + case HloOpcode::kLog1p: + return EmitLog1p(op->shape().element_type(), operand_value); case HloOpcode::kCos: return 
EmitCos(op->shape().element_type(), operand_value); case HloOpcode::kSin: @@ -493,6 +497,22 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( return EmitComposeComplex( op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle); } + case HloOpcode::kLog1p: { + // log1p(a+bi) = .5*log((a+1)^2+b^2) + i*atan2(b, a + 1) + auto a = EmitExtractReal(operand_value); + auto b = EmitExtractImag(operand_value); + llvm::Type* llvm_ty = a->getType(); + auto one = llvm::ConstantFP::get(llvm_ty, 1.0); + auto a_plus_one = ir_builder_->CreateFAdd(a, one); + auto sum_sq = ir_builder_->CreateFAdd( + ir_builder_->CreateFMul(a_plus_one, a_plus_one), + ir_builder_->CreateFMul(b, b)); + TF_ASSIGN_OR_RETURN(auto log_sum_sq, EmitLog(component_type, sum_sq)); + TF_ASSIGN_OR_RETURN(auto angle, EmitAtan2(component_type, b, a_plus_one)); + auto one_half = llvm::ConstantFP::get(llvm_ty, 0.5); + return EmitComposeComplex( + op, ir_builder_->CreateFMul(one_half, log_sum_sq), angle); + } case HloOpcode::kConvert: { PrimitiveType from_type = op->operand(0)->shape().element_type(); TF_RET_CHECK(primitive_util::IsComplexType(from_type)); @@ -523,6 +543,20 @@ StatusOr ElementalIrEmitter::EmitComplexUnaryOp( return EmitComposeComplex(op, ir_builder_->CreateFMul(exp_a, cos_b), ir_builder_->CreateFMul(exp_a, sin_b)); } + case HloOpcode::kExpm1: { + // e^(a+bi)-1 = (e^a*cos(b)-1)+e^a*sin(b)i + TF_ASSIGN_OR_RETURN( + auto exp_a, EmitExp(component_type, EmitExtractReal(operand_value))); + TF_ASSIGN_OR_RETURN( + auto cos_b, EmitCos(component_type, EmitExtractImag(operand_value))); + TF_ASSIGN_OR_RETURN( + auto sin_b, EmitSin(component_type, EmitExtractImag(operand_value))); + auto one = llvm::ConstantFP::get(exp_a->getType(), 1.0); + auto real_result = + ir_builder_->CreateFSub(ir_builder_->CreateFMul(exp_a, cos_b), one); + auto imag_result = ir_builder_->CreateFMul(exp_a, sin_b); + return EmitComposeComplex(op, real_result, imag_result); + } case HloOpcode::kCos: { // cos(z) = .5(e^(iz) + e^(-iz)) // cos(a+bi) = .5(e^(-b+ai) + e^(b-ai)) @@ -975,6 +1009,28 @@ StatusOr ElementalIrEmitter::EmitLog(PrimitiveType prim_type, {value->getType()}, ir_builder_); } +StatusOr ElementalIrEmitter::EmitLog1p(PrimitiveType prim_type, + llvm::Value* value) const { + auto x = value; + auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); + auto one = llvm::ConstantFP::get(type, 1.0); + auto negative_half = llvm::ConstantFP::get(type, -0.5); + // When x is large, the naive evaluation of ln(x + 1) is more + // accurate than the Taylor series. + TF_ASSIGN_OR_RETURN(auto for_large_x, + EmitLog(prim_type, ir_builder_->CreateFAdd(x, one))); + // The Taylor series for ln(x+1) is x - x^2/2 - x^3/3 + …. 
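+  // For |x| below the threshold we keep only the first two terms, x - x^2/2;
+  // the FMul below evaluates this as ((-0.5 * x) + 1) * x.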
+ auto for_small_x = ir_builder_->CreateFMul( + ir_builder_->CreateFAdd(ir_builder_->CreateFMul(negative_half, x), one), + x); + const auto kAntilogarithmIsSmallThreshold = 1e-4; + auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, + {type}, ir_builder_); + auto x_is_small = ir_builder_->CreateFCmpOLT( + abs_x, llvm::ConstantFP::get(type, kAntilogarithmIsSmallThreshold)); + return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x); +} + StatusOr ElementalIrEmitter::EmitSin(PrimitiveType prim_type, llvm::Value* value) const { return llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::sin, {value}, @@ -993,6 +1049,29 @@ StatusOr ElementalIrEmitter::EmitExp(PrimitiveType prim_type, {value->getType()}, ir_builder_); } +StatusOr ElementalIrEmitter::EmitExpm1(PrimitiveType prim_type, + llvm::Value* value) const { + auto x = value; + auto type = llvm_ir::PrimitiveTypeToIrType(prim_type, module_); + auto one = llvm::ConstantFP::get(type, 1.0); + auto half = llvm::ConstantFP::get(type, 0.5); + // When the exponent is large, the naive evaluation of e^(x) - 1 is more + // accurate than the Taylor series. + TF_ASSIGN_OR_RETURN(auto exp_x, EmitExp(prim_type, value)); + auto for_large_x = ir_builder_->CreateFSub(exp_x, one); + // The Taylor series for exp(x) is 1 + x + x^2/2 + x^3/6 + …. + // We want exp(x)-1 which is x + x^2/2 + x^3/6 + …. + auto x_squared = ir_builder_->CreateFAdd(x, x); + auto x_squared_over_two = ir_builder_->CreateFMul(x_squared, half); + auto for_small_x = ir_builder_->CreateFAdd(x, x_squared_over_two); + const auto kExponentIsSmallThreshold = 1e-5; + auto abs_x = llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::fabs, {value}, + {type}, ir_builder_); + auto x_is_small = ir_builder_->CreateFCmpOLT( + abs_x, llvm::ConstantFP::get(type, kExponentIsSmallThreshold)); + return ir_builder_->CreateSelect(x_is_small, for_small_x, for_large_x); +} + StatusOr ElementalIrEmitter::EmitPow(PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const { @@ -1344,6 +1423,482 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeRngElementGenerator( }; } +StatusOr ElementalIrEmitter::EmitElementalSelect( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const { + TF_ASSIGN_OR_RETURN(llvm::Value * pred_value, + operand_to_generator.at(hlo->operand(0))( + ElementwiseSourceIndex(index, *hlo, 0))); + TF_ASSIGN_OR_RETURN(llvm::Value * on_true_value, + operand_to_generator.at(hlo->operand(1))( + ElementwiseSourceIndex(index, *hlo, 1))); + TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value, + operand_to_generator.at(hlo->operand(2))( + ElementwiseSourceIndex(index, *hlo, 2))); + return ir_builder_->CreateSelect( + ir_builder_->CreateTrunc(pred_value, ir_builder_->getInt1Ty()), + on_true_value, on_false_value); +} + +StatusOr ElementalIrEmitter::EmitElementalClamp( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const { + TF_ASSIGN_OR_RETURN(llvm::Value * min_value, + operand_to_generator.at(hlo->operand(0))( + ElementwiseSourceIndex(index, *hlo, 0))); + TF_ASSIGN_OR_RETURN(llvm::Value * arg_value, + operand_to_generator.at(hlo->operand(1))( + ElementwiseSourceIndex(index, *hlo, 1))); + TF_ASSIGN_OR_RETURN(llvm::Value * max_value, + operand_to_generator.at(hlo->operand(2))( + ElementwiseSourceIndex(index, *hlo, 2))); + PrimitiveType prim_type = hlo->shape().element_type(); + if 
(primitive_util::IsFloatingPointType(prim_type)) { + return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value)); + } else if (primitive_util::IsIntegralType(prim_type)) { + bool is_signed = primitive_util::IsSignedIntegralType(prim_type); + return EmitIntegralMin( + max_value, EmitIntegralMax(min_value, arg_value, is_signed), is_signed); + } else { + return Unimplemented("Clamp unimplemented for %s", + PrimitiveType_Name(prim_type).c_str()); + } +} + +StatusOr ElementalIrEmitter::EmitElementalConcatenate( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& target_index) const { + const int64 concat_dim = hlo->dimensions(0); + auto source_index = target_index; + + llvm::BasicBlock* init_block = ir_builder_->GetInsertBlock(); + + // A terminator should be present iff we're emitting code + // into the middle (as opposed to the end) of a basic block. + CHECK_EQ(ir_builder_->GetInsertPoint() == init_block->end(), + init_block->getTerminator() == nullptr); + + llvm::BasicBlock* exit_block; + if (ir_builder_->GetInsertPoint() == init_block->end()) { + exit_block = llvm_ir::CreateBasicBlock( + /*insert_before=*/nullptr, IrName(hlo, "merge"), ir_builder_); + } else { + exit_block = init_block->splitBasicBlock(ir_builder_->GetInsertPoint(), + AsStringRef(IrName(hlo, "merge"))); + init_block->getTerminator()->eraseFromParent(); + } + + llvm_ir::SetToFirstInsertPoint(exit_block, ir_builder_); + llvm::PHINode* output = ir_builder_->CreatePHI( + llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), + hlo->operands().size()); + auto prior_insert_point = ir_builder_->GetInsertPoint(); + + ir_builder_->SetInsertPoint(init_block); + + for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); + ++operand_idx) { + const HloInstruction* operand = hlo->operand(operand_idx); + auto true_block = llvm_ir::CreateBasicBlock( + exit_block, StrCat("concat_index_from_operand", operand_idx), + ir_builder_); + auto false_block = llvm_ir::CreateBasicBlock( + exit_block, StrCat("concat_index_not_from_operand", operand_idx), + ir_builder_); + auto concat_dim_size = + llvm::ConstantInt::get(source_index[concat_dim]->getType(), + operand->shape().dimensions(concat_dim)); + ir_builder_->CreateCondBr( + ir_builder_->CreateICmpULT(source_index[concat_dim], concat_dim_size), + true_block, false_block); + + // Create the terminator of the true block before calling operand + // generators, because they require non-degenerate basic blocks. + ir_builder_->SetInsertPoint( + llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block)); + TF_ASSIGN_OR_RETURN(llvm::Value * value, + operand_to_generator.at(operand)(source_index)); + output->addIncoming(value, ir_builder_->GetInsertBlock()); + + // Subtract the size of the concat dimension of the current operand + // from the source index. + ir_builder_->SetInsertPoint(false_block); + source_index[concat_dim] = + ir_builder_->CreateSub(source_index[concat_dim], concat_dim_size); + } + + ir_builder_->CreateUnreachable(); + ir_builder_->SetInsertPoint(exit_block, prior_insert_point); + return output; +} + +StatusOr ElementalIrEmitter::EmitElementalDynamicSlice( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const { + // Emit IR to read dynamic start indices from hlo->operand(1). 
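+  // The start index for each dimension is clamped to
+  // [0, operand_dim_size - output_dim_size], so the sliced window always
+  // lies inside the operand.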
+ const HloInstruction* input_hlo = hlo->operand(0); + const int64 rank = ShapeUtil::Rank(input_hlo->shape()); + llvm_ir::IrArray::Index slice_start_index(rank); + for (int64 i = 0; i < rank; ++i) { + llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i)); + TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value, + operand_to_generator.at(hlo->operand(1))(dim_index)); + + // Clamp the start index so that the sliced portion fits in the operand: + // start_index = clamp(start_index, 0, operand_dim_size - output_dim_size) + + // TODO(b/74360564): This is implementation defined behavior, but is + // currently respected by all implementations. Change this if we ever decide + // to oficially document different behavior. + start_index_value = ir_builder_->CreateSExtOrBitCast(start_index_value, + index[i]->getType()); + llvm::Value* operand_dim_size = llvm::ConstantInt::get( + start_index_value->getType(), input_hlo->shape().dimensions(i)); + llvm::Value* output_dim_size = llvm::ConstantInt::get( + start_index_value->getType(), hlo->shape().dimensions(i)); + + start_index_value = EmitIntegralMin( + ir_builder_->CreateSub(operand_dim_size, output_dim_size), + EmitIntegralMax(llvm::ConstantInt::get(start_index_value->getType(), 0), + start_index_value, /*is_signed=*/true), + /*is_signed=*/true); + + start_index_value->setName( + AsStringRef(IrName(hlo, StrCat("start_idx", i)))); + slice_start_index[i] = start_index_value; + } + + llvm_ir::IrArray::Index input_index(rank); + for (int64 i = 0; i < rank; ++i) { + // Emit IR which computes: + // input_index = start_index + offset_index + input_index[i] = ir_builder_->CreateAdd(slice_start_index[i], index[i]); + } + return operand_to_generator.at(input_hlo)(input_index); +} + +StatusOr ElementalIrEmitter::EmitElementalGather( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const { + const Shape& operand_shape = hlo->operand(0)->shape(); + const Shape& indices_shape = hlo->operand(1)->shape(); + const Shape& output_shape = hlo->shape(); + + const GatherDimensionNumbers& dim_numbers = hlo->gather_dimension_numbers(); + + const llvm_ir::ElementGenerator& operand_generator = + operand_to_generator.at(hlo->operand(0)); + const llvm_ir::ElementGenerator& indices_generator = + operand_to_generator.at(hlo->operand(1)); + + // This is the index into `operand` that holds the element we want to + // generate. This index "unsafe" as in the components in here may be + // out of bounds. + IrArray::Index unsafe_operand_index; + + // First copy in the window indices to unsafe_operand_index. + for (int64 i = 0, e = operand_shape.dimensions_size(), + unsafe_operand_index_dim = 0; + i < e; i++) { + if (c_binary_search(dim_numbers.elided_window_dims(), i)) { + unsafe_operand_index.push_back(ir_builder_->getInt64(0)); + } else { + unsafe_operand_index.push_back( + index[dim_numbers.output_window_dims(unsafe_operand_index_dim++)]); + } + } + + // This is the index of the index vector in the gather_indices tensor. 
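+  // It is assembled from the output dimensions that are not window
+  // dimensions; if the index vector is an explicit dimension of the indices
+  // shape, a placeholder slot is reserved at index_vector_dim and filled in
+  // one component at a time below.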
+ IrArray::Index gather_index_index; + { + std::vector gather_index_index_components; + for (int64 i = 0, e = output_shape.dimensions_size(); i < e; i++) { + if (!c_binary_search(dim_numbers.output_window_dims(), i)) { + gather_index_index.push_back(index[i]); + } + } + + if (gather_index_index.size() != indices_shape.dimensions_size()) { + gather_index_index.InsertAt(dim_numbers.index_vector_dim(), nullptr); + } + } + + auto add_to_unsafe_operand_index = [&](llvm::Value* index_component, + int64 dim) { + llvm::Value* gather_dim_component_extended = ir_builder_->CreateSExtOrTrunc( + index_component, ir_builder_->getInt64Ty()); + unsafe_operand_index[dim_numbers.gather_dims_to_operand_dims(dim)] = + ir_builder_->CreateAdd( + unsafe_operand_index[dim_numbers.gather_dims_to_operand_dims(dim)], + gather_dim_component_extended); + }; + + if (indices_shape.dimensions_size() == dim_numbers.index_vector_dim()) { + TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component, + indices_generator(gather_index_index)); + add_to_unsafe_operand_index(gather_dim_component, 0); + } else { + int64 index_vector_size = + indices_shape.dimensions(dim_numbers.index_vector_dim()); + for (int64 i = 0; i < index_vector_size; i++) { + gather_index_index[dim_numbers.index_vector_dim()] = + ir_builder_->getInt64(i); + TF_ASSIGN_OR_RETURN(llvm::Value * gather_dim_component, + indices_generator(gather_index_index)); + add_to_unsafe_operand_index(gather_dim_component, i); + } + } + + IrArray::Index safe_operand_index; + for (int64 i = 0, e = unsafe_operand_index.size(); i < e; i++) { + safe_operand_index.push_back(ir_builder_->CreateURem( + unsafe_operand_index[i], + ir_builder_->getInt64(operand_shape.dimensions(i)))); + } + + return operand_generator(safe_operand_index); +} + +StatusOr ElementalIrEmitter::EmitElementalDynamicUpdateSlice( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const { + const HloInstruction* input_hlo = hlo->operand(0); + const HloInstruction* update_hlo = hlo->operand(1); + const HloInstruction* start_hlo = hlo->operand(2); + // Calculate slice start/end indices. + const int64 rank = ShapeUtil::Rank(input_hlo->shape()); + llvm_ir::IrArray::Index slice_start_index(rank); + llvm_ir::IrArray::Index slice_limit_index(rank); + // Slice intersection gathers (ANDs) conditions on all ranks for which + // 'input' is set to 'update' + llvm::Value* slice_intersection = ir_builder_->getTrue(); + + for (int64 i = 0; i < rank; ++i) { + llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i)); + TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value, + operand_to_generator.at(start_hlo)(dim_index)); + + // Clamp the start index so that the update region fits in the operand. + // start_index = clamp(start_index, 0, input_dim_size - update_dim_size) + + // TODO(b/74360564): This is implementation defined behavior, but is + // currently respected by all implementations. Change this if we ever decide + // to oficially document different behavior. 
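+  // With the clamped start, the limit index is start + update_dim_size, and
+  // the element at 'index' is taken from 'update' only when
+  // start <= index < limit holds in every dimension.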
+ start_index_value = ir_builder_->CreateSExtOrBitCast(start_index_value, + index[i]->getType()); + llvm::Value* input_dim_size = llvm::ConstantInt::get( + index[i]->getType(), input_hlo->shape().dimensions(i)); + llvm::Value* update_dim_size = llvm::ConstantInt::get( + index[i]->getType(), update_hlo->shape().dimensions(i)); + + start_index_value = EmitIntegralMin( + ir_builder_->CreateSub(input_dim_size, update_dim_size), + EmitIntegralMax(llvm::ConstantInt::get(start_index_value->getType(), 0), + start_index_value, /*is_signed=*/true), + /*is_signed=*/true); + + start_index_value->setName( + AsStringRef(IrName(hlo, StrCat("start_idx", i)))); + slice_start_index[i] = start_index_value; + slice_limit_index[i] = + ir_builder_->CreateAdd(slice_start_index[i], update_dim_size); + + slice_intersection = ir_builder_->CreateAnd( + slice_intersection, + ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]), + "slice_intersection"); + slice_intersection = ir_builder_->CreateAnd( + slice_intersection, + ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]), + "slice_intersection"); + } + + // Emit: + // if (slice_intersection) -> return data from 'update'. + // else -> return data from 'input'. + llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), + "ret_value_addr", ir_builder_); + llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( + slice_intersection, "slice_intersection", ir_builder_); + + // Handle true BB (return data from 'update') + SetToFirstInsertPoint(if_data.true_block, ir_builder_); + // Compute update index for intersection case. + llvm_ir::IrArray::Index update_index(rank); + for (int64 i = 0; i < rank; ++i) { + update_index[i] = ir_builder_->CreateSub(index[i], slice_start_index[i]); + } + TF_ASSIGN_OR_RETURN(llvm::Value * true_value, + operand_to_generator.at(update_hlo)(update_index)); + ir_builder_->CreateStore(true_value, ret_value_addr); + + // Handle false BB (return data from 'input') + SetToFirstInsertPoint(if_data.false_block, ir_builder_); + TF_ASSIGN_OR_RETURN(llvm::Value * false_value, + operand_to_generator.at(input_hlo)(index)); + ir_builder_->CreateStore(false_value, ret_value_addr); + + SetToFirstInsertPoint(if_data.after_block, ir_builder_); + return ir_builder_->CreateLoad(ret_value_addr); +} + +StatusOr ElementalIrEmitter::EmitElementalPad( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& padded_index) const { + auto index = padded_index; + llvm::Value* in_bounds = ir_builder_->getTrue(); + for (size_t i = 0; i < index.size(); ++i) { + auto index_typed_const = [=](int64 n) { + return llvm::ConstantInt::get(index[i]->getType(), n); + }; + const auto& pad_dim = hlo->padding_config().dimensions(i); + index[i] = ir_builder_->CreateSub( + index[i], index_typed_const(pad_dim.edge_padding_low())); + in_bounds = ir_builder_->CreateAnd( + in_bounds, ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)), + "in_bounds"); + in_bounds = ir_builder_->CreateAnd( + in_bounds, + ir_builder_->CreateICmpEQ( + index_typed_const(0), + ir_builder_->CreateURem( + index[i], index_typed_const(pad_dim.interior_padding() + 1))), + "in_bounds"); + index[i] = ir_builder_->CreateSDiv( + index[i], index_typed_const(pad_dim.interior_padding() + 1)); + in_bounds = ir_builder_->CreateAnd( + in_bounds, + ir_builder_->CreateICmpSLT( + index[i], + index_typed_const(hlo->operand(0)->shape().dimensions(i))), + 
"in_bounds"); + } + + // if (in_bounds) { + // ret_value = operand0[index]; // source + // } else { + // ret_value = *operand1; // padding + // } + llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry( + llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), module_), + "pad_result_addr", ir_builder_); + llvm_ir::LlvmIfData if_data = + llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_); + SetToFirstInsertPoint(if_data.true_block, ir_builder_); + TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, + operand_to_generator.at(hlo->operand(0))(index)); + ir_builder_->CreateStore(operand_value, ret_value_addr); + + SetToFirstInsertPoint(if_data.false_block, ir_builder_); + TF_ASSIGN_OR_RETURN(llvm::Value * padding_value, + operand_to_generator.at(hlo->operand(1))({})); + ir_builder_->CreateStore(padding_value, ret_value_addr); + + SetToFirstInsertPoint(if_data.after_block, ir_builder_); + // Don't create phi(operand_value, padding_value) here, because invoking + // operand_to_generator may create new basic blocks, making the parent + // of operand_value or padding_value no longer a predecessor of + // if_data.after_block. + return ir_builder_->CreateLoad(ret_value_addr); +} + +StatusOr ElementalIrEmitter::EmitElementalDot( + const HloInstruction* hlo, + const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& dot_result_index) const { + auto lhs_generator = operand_to_generator.at(hlo->operand(0)); + auto rhs_generator = operand_to_generator.at(hlo->operand(1)); + + const DotDimensionNumbers& dim_numbers = hlo->dot_dimension_numbers(); + int64 lhs_contracting_dim = dim_numbers.lhs_contracting_dimensions(0); + int64 rhs_contracting_dim = dim_numbers.rhs_contracting_dimensions(0); + + int64 contracted_dim_size = + hlo->operand(0)->shape().dimensions(lhs_contracting_dim); + int64 lhs_dims = hlo->operand(0)->shape().dimensions_size(); + int64 rhs_dims = hlo->operand(1)->shape().dimensions_size(); + + std::unique_ptr inner_loop = llvm_ir::ForLoop::EmitForLoop( + IrName(hlo, "inner"), ir_builder_->getInt64(0), + ir_builder_->getInt64(contracted_dim_size), ir_builder_->getInt64(1), + ir_builder_); + + SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), ir_builder_); + PrimitiveType primitive_type = hlo->shape().element_type(); + llvm::Type* primitive_type_llvm = + llvm_ir::PrimitiveTypeToIrType(primitive_type, module_); + llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry( + primitive_type_llvm, "dot_acc", ir_builder_); + ir_builder_->CreateStore(llvm::Constant::getNullValue(primitive_type_llvm), + accumulator_alloca); + + SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), ir_builder_); + + // This is the inner reduction loop for a dot operation that produces + // one element in the output. If the operands to the dot operation have + // shapes [A,B,C,T] and [D,T,E], the result has a shape [A,B,C,D,E]. 
+ // Given an output index [a,b,c,d,e] in the result, we compute: + // sum(lhs[a,b,c,t]*rhs[d,t,e] for t in [0, T)) + + IrArray::Index lhs_index, rhs_index; + + for (int64 i = 0; i < lhs_dims - 1; i++) { + lhs_index.push_back(dot_result_index[i]); + } + lhs_index.InsertAt(lhs_contracting_dim, inner_loop->GetIndVarValue()); + + for (int64 i = 0; i < rhs_dims - 1; i++) { + rhs_index.push_back(dot_result_index[lhs_dims - 1 + i]); + } + rhs_index.InsertAt(rhs_contracting_dim, inner_loop->GetIndVarValue()); + + llvm::Value* current_accumulator = + ir_builder_->CreateLoad(accumulator_alloca); + TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index)); + TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index)); + llvm::Value* next_accumulator; + if (primitive_util::IsComplexType(primitive_type)) { + llvm::Value* product_real = ir_builder_->CreateFSub( + ir_builder_->CreateFMul(EmitExtractReal(lhs_value), + EmitExtractReal(rhs_value)), + ir_builder_->CreateFMul(EmitExtractImag(lhs_value), + EmitExtractImag(rhs_value))); + llvm::Value* product_imag = ir_builder_->CreateFAdd( + ir_builder_->CreateFMul(EmitExtractReal(lhs_value), + EmitExtractImag(rhs_value)), + ir_builder_->CreateFMul(EmitExtractImag(lhs_value), + EmitExtractReal(rhs_value))); + next_accumulator = ir_builder_->CreateInsertValue( + current_accumulator, + ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator), + product_real), + {0}); + next_accumulator = ir_builder_->CreateInsertValue( + next_accumulator, + ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator), + product_imag), + {1}); + } else if (primitive_util::IsFloatingPointType(primitive_type)) { + next_accumulator = ir_builder_->CreateFAdd( + current_accumulator, ir_builder_->CreateFMul(lhs_value, rhs_value)); + } else { + next_accumulator = ir_builder_->CreateAdd( + current_accumulator, ir_builder_->CreateMul(lhs_value, rhs_value)); + } + ir_builder_->CreateStore(next_accumulator, accumulator_alloca); + + SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), ir_builder_); + return ir_builder_->CreateLoad(accumulator_alloca); +} + llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( const HloInstruction* hlo, const ElementalIrEmitter::HloToElementGeneratorMap& operand_to_generator) @@ -1358,10 +1913,12 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kCopy: case HloOpcode::kCos: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kFloor: case HloOpcode::kImag: case HloOpcode::kIsFinite: case HloOpcode::kLog: + case HloOpcode::kLog1p: case HloOpcode::kNegate: case HloOpcode::kNot: case HloOpcode::kReal: @@ -1411,43 +1968,12 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kSelect: return [this, hlo, &operand_to_generator]( const IrArray::Index& index) -> StatusOr { - TF_ASSIGN_OR_RETURN(llvm::Value * pred_value, - operand_to_generator.at(hlo->operand(0))( - ElementwiseSourceIndex(index, *hlo, 0))); - TF_ASSIGN_OR_RETURN(llvm::Value * on_true_value, - operand_to_generator.at(hlo->operand(1))( - ElementwiseSourceIndex(index, *hlo, 1))); - TF_ASSIGN_OR_RETURN(llvm::Value * on_false_value, - operand_to_generator.at(hlo->operand(2))( - ElementwiseSourceIndex(index, *hlo, 2))); - return ir_builder_->CreateSelect( - ir_builder_->CreateTrunc(pred_value, ir_builder_->getInt1Ty()), - on_true_value, on_false_value); + return EmitElementalSelect(hlo, operand_to_generator, index); }; case HloOpcode::kClamp: return [this, hlo, 
&operand_to_generator]( const IrArray::Index& index) -> StatusOr { - TF_ASSIGN_OR_RETURN(llvm::Value * min_value, - operand_to_generator.at(hlo->operand(0))( - ElementwiseSourceIndex(index, *hlo, 0))); - TF_ASSIGN_OR_RETURN(llvm::Value * arg_value, - operand_to_generator.at(hlo->operand(1))( - ElementwiseSourceIndex(index, *hlo, 1))); - TF_ASSIGN_OR_RETURN(llvm::Value * max_value, - operand_to_generator.at(hlo->operand(2))( - ElementwiseSourceIndex(index, *hlo, 2))); - PrimitiveType prim_type = hlo->shape().element_type(); - if (primitive_util::IsFloatingPointType(prim_type)) { - return EmitFloatMin(max_value, EmitFloatMax(min_value, arg_value)); - } else if (primitive_util::IsIntegralType(prim_type)) { - bool is_signed = primitive_util::IsSignedIntegralType(prim_type); - return EmitIntegralMin( - max_value, EmitIntegralMax(min_value, arg_value, is_signed), - is_signed); - } else { - return Unimplemented("Clamp unimplemented for %s", - PrimitiveType_Name(prim_type).c_str()); - } + return EmitElementalClamp(hlo, operand_to_generator, index); }; case HloOpcode::kReducePrecision: return [this, hlo, &operand_to_generator]( @@ -1460,70 +1986,8 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kConcatenate: return [this, hlo, &operand_to_generator]( const IrArray::Index target_index) -> StatusOr { - const int64 concat_dim = hlo->dimensions(0); - auto source_index = target_index; - - llvm::BasicBlock* init_block = ir_builder_->GetInsertBlock(); - - // A terminator should be present iff we're emitting code - // into the middle (as opposed to the end) of a basic block. - CHECK_EQ(ir_builder_->GetInsertPoint() == init_block->end(), - init_block->getTerminator() == nullptr); - - llvm::BasicBlock* exit_block; - if (ir_builder_->GetInsertPoint() == init_block->end()) { - exit_block = llvm_ir::CreateBasicBlock( - /*insert_before=*/nullptr, IrName(hlo, "merge"), ir_builder_); - } else { - exit_block = init_block->splitBasicBlock( - ir_builder_->GetInsertPoint(), AsStringRef(IrName(hlo, "merge"))); - init_block->getTerminator()->eraseFromParent(); - } - - llvm_ir::SetToFirstInsertPoint(exit_block, ir_builder_); - llvm::PHINode* output = - ir_builder_->CreatePHI(llvm_ir::PrimitiveTypeToIrType( - hlo->shape().element_type(), module_), - hlo->operands().size()); - auto prior_insert_point = ir_builder_->GetInsertPoint(); - - ir_builder_->SetInsertPoint(init_block); - - for (int64 operand_idx = 0; operand_idx < hlo->operand_count(); - ++operand_idx) { - const HloInstruction* operand = hlo->operand(operand_idx); - auto true_block = llvm_ir::CreateBasicBlock( - exit_block, StrCat("concat_index_from_operand", operand_idx), - ir_builder_); - auto false_block = llvm_ir::CreateBasicBlock( - exit_block, StrCat("concat_index_not_from_operand", operand_idx), - ir_builder_); - auto concat_dim_size = - llvm::ConstantInt::get(source_index[concat_dim]->getType(), - operand->shape().dimensions(concat_dim)); - ir_builder_->CreateCondBr( - ir_builder_->CreateICmpULT(source_index[concat_dim], - concat_dim_size), - true_block, false_block); - - // Create the terminator of the true block before calling operand - // generators, because they require non-degenerate basic blocks. 
- ir_builder_->SetInsertPoint( - llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block)); - TF_ASSIGN_OR_RETURN(llvm::Value * value, - operand_to_generator.at(operand)(source_index)); - output->addIncoming(value, ir_builder_->GetInsertBlock()); - - // Subtract the size of the concat dimension of the current operand - // from the source index. - ir_builder_->SetInsertPoint(false_block); - source_index[concat_dim] = - ir_builder_->CreateSub(source_index[concat_dim], concat_dim_size); - } - - ir_builder_->CreateUnreachable(); - ir_builder_->SetInsertPoint(exit_block, prior_insert_point); - return output; + return EmitElementalConcatenate(hlo, operand_to_generator, + target_index); }; case HloOpcode::kReverse: return [this, hlo, &operand_to_generator]( @@ -1559,184 +2023,19 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kDynamicSlice: return [this, hlo, &operand_to_generator]( const IrArray::Index& index) -> StatusOr { - // Emit IR to read dynamic start indices from hlo->operand(1). - const HloInstruction* input_hlo = hlo->operand(0); - const int64 rank = ShapeUtil::Rank(input_hlo->shape()); - llvm_ir::IrArray::Index slice_start_index(rank); - for (int64 i = 0; i < rank; ++i) { - llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i)); - TF_ASSIGN_OR_RETURN( - llvm::Value * start_index_value, - operand_to_generator.at(hlo->operand(1))(dim_index)); - start_index_value->setName( - AsStringRef(IrName(hlo, StrCat("start_idx", i)))); - slice_start_index[i] = start_index_value; - } + return EmitElementalDynamicSlice(hlo, operand_to_generator, index); + }; - llvm_ir::IrArray::Index input_index(rank); - for (int64 i = 0; i < rank; ++i) { - // Emit IR which computes: - // input_index = (start_index + offset_index) % dim_size - // Security note: this is the code that keeps the indices in-bounds. - llvm::Value* dim_size = llvm::ConstantInt::get( - index[i]->getType(), input_hlo->shape().dimensions(i)); - llvm::Value* start_index = ir_builder_->CreateZExtOrBitCast( - slice_start_index[i], index[i]->getType()); - input_index[i] = ir_builder_->CreateURem( - ir_builder_->CreateAdd(start_index, index[i]), dim_size); - } - return operand_to_generator.at(input_hlo)(input_index); + case HloOpcode::kGather: + return [this, hlo, &operand_to_generator]( + const IrArray::Index& index) -> StatusOr { + return EmitElementalGather(hlo, operand_to_generator, index); }; case HloOpcode::kDynamicUpdateSlice: return [this, hlo, &operand_to_generator]( const IrArray::Index& index) -> StatusOr { - const HloInstruction* input_hlo = hlo->operand(0); - const HloInstruction* update_hlo = hlo->operand(1); - const HloInstruction* start_hlo = hlo->operand(2); - // Calculate slice start/end indices. - const int64 rank = ShapeUtil::Rank(input_hlo->shape()); - llvm_ir::IrArray::Index slice_start_index(rank); - llvm_ir::IrArray::Index slice_limit_index(rank); - // Slice starts at update[index - slice_start_index_adjusted], - // where adjusted value = slice_start_index when in bounds, and - // adjusted value = slice_start_index - input_dim, when wrapping. - llvm_ir::IrArray::Index slice_start_index_adjusted(rank); - - // Slice intersection gathers (ANDs) conditions on all ranks for which - // 'input' is set to 'update' - llvm::Value* slice_intersection = ir_builder_->getTrue(); - - for (int64 i = 0; i < rank; ++i) { - // Emit IR to read dynamic start indices from 'start_hlo'. 
- llvm_ir::IrArray::Index dim_index(1, ir_builder_->getInt64(i)); - TF_ASSIGN_OR_RETURN(llvm::Value * start_index_value, - operand_to_generator.at(start_hlo)(dim_index)); - start_index_value->setName( - AsStringRef(IrName(hlo, StrCat("start_idx", i)))); - slice_start_index[i] = ir_builder_->CreateZExtOrBitCast( - start_index_value, index[i]->getType()); - - llvm::Value* input_dim_size = llvm::ConstantInt::get( - index[i]->getType(), input_hlo->shape().dimensions(i)); - llvm::Value* update_dim_size = llvm::ConstantInt::get( - index[i]->getType(), update_hlo->shape().dimensions(i)); - - // Generate code to handle wrapping semantics: - // slice_start_index[i] = slice_start_index[i] % input_dim_size; - // slice_limit_index[i] = slice_start_index[i] + update_dim_size. - // slice_start_index[i] is updated in place and it will now be in - // range. slice_limit_index[i] may be out of range, and it's being - // URem-ed below if so. - slice_start_index[i] = - ir_builder_->CreateURem(slice_start_index[i], input_dim_size); - slice_limit_index[i] = - ir_builder_->CreateAdd(slice_start_index[i], update_dim_size); - - // Test if slice_limit_index[i] is in bounds - llvm::Value* in_bounds = - ir_builder_->CreateICmpULE(slice_limit_index[i], input_dim_size); - llvm_ir::LlvmIfData if_in_bounds = - llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_); - - // Handle true BB (slice_limit_index[i] <= input_dim_size). - SetToFirstInsertPoint(if_in_bounds.true_block, ir_builder_); - // Check that index[i] >= slice_start_index[i] && - // index[i] < slice_limit_index[i] - llvm::Value* slice_intersection_in_bounds = ir_builder_->CreateAnd( - slice_intersection, - ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]), - "slice_intersection_in"); - slice_intersection_in_bounds = ir_builder_->CreateAnd( - slice_intersection_in_bounds, - ir_builder_->CreateICmpSLT(index[i], slice_limit_index[i]), - "slice_intersection_in"); - - // Handle false BB (slice_limit_index[i] > input_dim_size). - SetToFirstInsertPoint(if_in_bounds.false_block, ir_builder_); - // Check that index[i] >= slice_start_index[i] || - // index[i] < slice_limit_index[i]%input_dim_size. - llvm::Value* index_wraps = ir_builder_->CreateICmpSLT( - index[i], - ir_builder_->CreateURem(slice_limit_index[i], input_dim_size)); - llvm::Value* slice_intersection_or = ir_builder_->CreateOr( - ir_builder_->CreateICmpSGE(index[i], slice_start_index[i]), - index_wraps, "slice_intersection_out"); - llvm::Value* slice_intersection_out_of_bounds = - ir_builder_->CreateAnd(slice_intersection, slice_intersection_or, - "slice_intersection_out"); - // Create value for slice_start_index_adjusted[i] when out of bounds. - // If within out-of-bounds if. - llvm_ir::LlvmIfData if_start_needs_adjustment = - llvm_ir::EmitIfThenElse(index_wraps, "adjust_start", ir_builder_); - SetToFirstInsertPoint(if_start_needs_adjustment.true_block, - ir_builder_); - llvm::Value* slice_start_index_adjusted_oob = - ir_builder_->CreateSub(slice_start_index[i], input_dim_size); - SetToFirstInsertPoint(if_start_needs_adjustment.after_block, - ir_builder_); - llvm::PHINode* slice_start_index_adjusted_phi = - ir_builder_->CreatePHI(slice_start_index_adjusted_oob->getType(), - 2); - slice_start_index_adjusted_phi->addIncoming( - slice_start_index_adjusted_oob, - if_start_needs_adjustment.true_block); - slice_start_index_adjusted_phi->addIncoming( - slice_start_index[i], if_start_needs_adjustment.false_block); - // End of if within if. - - // After checking in/out of bounds. 
- SetToFirstInsertPoint(if_in_bounds.after_block, ir_builder_); - llvm::PHINode* phi_slice_intersection = - ir_builder_->CreatePHI(slice_intersection->getType(), 2); - phi_slice_intersection->addIncoming(slice_intersection_in_bounds, - if_in_bounds.true_block); - phi_slice_intersection->addIncoming( - slice_intersection_out_of_bounds, - if_start_needs_adjustment.after_block); - slice_intersection = phi_slice_intersection; - - llvm::PHINode* phi_index = - ir_builder_->CreatePHI(slice_start_index[i]->getType(), 2); - phi_index->addIncoming(slice_start_index[i], if_in_bounds.true_block); - phi_index->addIncoming(slice_start_index_adjusted_phi, - if_start_needs_adjustment.after_block); - slice_start_index_adjusted[i] = phi_index; - } - - // Emit: - // if (slice_intersection) -> return data from 'update'. - // else -> return data from 'input'. - llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), - module_), - "ret_value_addr", ir_builder_); - llvm_ir::LlvmIfData if_data = llvm_ir::EmitIfThenElse( - slice_intersection, "slice_intersection", ir_builder_); - - // Handle true BB (return data from 'update') - SetToFirstInsertPoint(if_data.true_block, ir_builder_); - // Compute update index for intersection case. - llvm_ir::IrArray::Index update_index(rank); - for (int64 i = 0; i < rank; ++i) { - llvm::Value* update_dim_size = llvm::ConstantInt::get( - index[i]->getType(), update_hlo->shape().dimensions(i)); - // NOTE: Subtraction will be positive due to bounds checking above. - update_index[i] = ir_builder_->CreateURem( - ir_builder_->CreateSub(index[i], slice_start_index_adjusted[i]), - update_dim_size); - } - TF_ASSIGN_OR_RETURN(llvm::Value * true_value, - operand_to_generator.at(update_hlo)(update_index)); - ir_builder_->CreateStore(true_value, ret_value_addr); - - // Handle false BB (return data from 'input') - SetToFirstInsertPoint(if_data.false_block, ir_builder_); - TF_ASSIGN_OR_RETURN(llvm::Value * false_value, - operand_to_generator.at(input_hlo)(index)); - ir_builder_->CreateStore(false_value, ret_value_addr); - - SetToFirstInsertPoint(if_data.after_block, ir_builder_); - return ir_builder_->CreateLoad(ret_value_addr); + return EmitElementalDynamicUpdateSlice(hlo, operand_to_generator, + index); }; case HloOpcode::kBitcast: CHECK_EQ(ShapeUtil::ElementsIn(hlo->shape()), @@ -1765,155 +2064,16 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator( case HloOpcode::kRng: return MakeRngElementGenerator(hlo, operand_to_generator); case HloOpcode::kPad: - return [=, &operand_to_generator]( + return [this, hlo, &operand_to_generator]( const IrArray::Index& padded_index) -> StatusOr { - auto index = padded_index; - llvm::Value* in_bounds = ir_builder_->getTrue(); - for (size_t i = 0; i < index.size(); ++i) { - auto index_typed_const = [=](int64 n) { - return llvm::ConstantInt::get(index[i]->getType(), n); - }; - const auto& pad_dim = hlo->padding_config().dimensions(i); - index[i] = ir_builder_->CreateSub( - index[i], index_typed_const(pad_dim.edge_padding_low())); - in_bounds = ir_builder_->CreateAnd( - in_bounds, - ir_builder_->CreateICmpSGE(index[i], index_typed_const(0)), - "in_bounds"); - in_bounds = ir_builder_->CreateAnd( - in_bounds, - ir_builder_->CreateICmpEQ( - index_typed_const(0), - ir_builder_->CreateURem( - index[i], - index_typed_const(pad_dim.interior_padding() + 1))), - "in_bounds"); - index[i] = ir_builder_->CreateSDiv( - index[i], index_typed_const(pad_dim.interior_padding() + 1)); - 
in_bounds = ir_builder_->CreateAnd( - in_bounds, - ir_builder_->CreateICmpSLT( - index[i], - index_typed_const(hlo->operand(0)->shape().dimensions(i))), - "in_bounds"); - } - - // if (in_bounds) { - // ret_value = operand0[index]; // source - // } else { - // ret_value = *operand1; // padding - // } - llvm::Value* ret_value_addr = llvm_ir::EmitAllocaAtFunctionEntry( - llvm_ir::PrimitiveTypeToIrType(hlo->shape().element_type(), - module_), - "pad_result_addr", ir_builder_); - llvm_ir::LlvmIfData if_data = - llvm_ir::EmitIfThenElse(in_bounds, "in_bounds", ir_builder_); - SetToFirstInsertPoint(if_data.true_block, ir_builder_); - TF_ASSIGN_OR_RETURN(llvm::Value * operand_value, - operand_to_generator.at(hlo->operand(0))(index)); - ir_builder_->CreateStore(operand_value, ret_value_addr); - - SetToFirstInsertPoint(if_data.false_block, ir_builder_); - TF_ASSIGN_OR_RETURN(llvm::Value * padding_value, - operand_to_generator.at(hlo->operand(1))({})); - ir_builder_->CreateStore(padding_value, ret_value_addr); - - SetToFirstInsertPoint(if_data.after_block, ir_builder_); - // Don't create phi(operand_value, padding_value) here, because invoking - // operand_to_generator may create new basic blocks, making the parent - // of operand_value or padding_value no longer a predecessor of - // if_data.after_block. - return ir_builder_->CreateLoad(ret_value_addr); + return EmitElementalPad(hlo, operand_to_generator, padded_index); }; case HloOpcode::kDot: - return [=, &operand_to_generator](const IrArray::Index& dot_result_index) + return [this, hlo, + &operand_to_generator](const IrArray::Index& dot_result_index) -> StatusOr { - auto lhs_generator = operand_to_generator.at(hlo->operand(0)); - auto rhs_generator = operand_to_generator.at(hlo->operand(1)); - int64 contracted_dim_size = hlo->operand(0)->shape().dimensions( - hlo->operand(0)->shape().dimensions_size() - 1); - int64 lhs_dims = hlo->operand(0)->shape().dimensions_size(); - int64 rhs_dims = hlo->operand(1)->shape().dimensions_size(); - - std::unique_ptr inner_loop = - llvm_ir::ForLoop::EmitForLoop( - IrName(hlo, "inner"), ir_builder_->getInt64(0), - ir_builder_->getInt64(contracted_dim_size), - ir_builder_->getInt64(1), ir_builder_); - - SetToFirstInsertPoint(inner_loop->GetPreheaderBasicBlock(), - ir_builder_); - PrimitiveType primitive_type = hlo->shape().element_type(); - llvm::Type* primitive_type_llvm = - llvm_ir::PrimitiveTypeToIrType(primitive_type, module_); - llvm::Value* accumulator_alloca = llvm_ir::EmitAllocaAtFunctionEntry( - primitive_type_llvm, "dot_acc", ir_builder_); - ir_builder_->CreateStore( - llvm::Constant::getNullValue(primitive_type_llvm), - accumulator_alloca); - - SetToFirstInsertPoint(inner_loop->GetBodyBasicBlock(), ir_builder_); - - // This is the inner reduction loop for a dot operation that produces - // one element in the output. If the operands to the dot operation have - // shapes [A,B,C,T] and [D,T,E], the result has a shape [A,B,C,D,E]. 
- // Given an output index [a,b,c,d,e] in the result, we compute: - // sum(lhs[a,b,c,t]*rhs[d,t,e] for t in [0, T)) - - IrArray::Index lhs_index, rhs_index; - - for (int64 i = 0; i < lhs_dims - 1; i++) { - lhs_index.push_back(dot_result_index[i]); - } - lhs_index.push_back(inner_loop->GetIndVarValue()); - - for (int64 i = 0; i < rhs_dims - 2; i++) { - rhs_index.push_back(dot_result_index[lhs_dims - 1 + i]); - } - rhs_index.push_back(inner_loop->GetIndVarValue()); - rhs_index.push_back(dot_result_index.back()); - - llvm::Value* current_accumulator = - ir_builder_->CreateLoad(accumulator_alloca); - TF_ASSIGN_OR_RETURN(llvm::Value * lhs_value, lhs_generator(lhs_index)); - TF_ASSIGN_OR_RETURN(llvm::Value * rhs_value, rhs_generator(rhs_index)); - llvm::Value* next_accumulator; - if (primitive_util::IsComplexType(primitive_type)) { - llvm::Value* product_real = ir_builder_->CreateFSub( - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractReal(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractImag(rhs_value))); - llvm::Value* product_imag = ir_builder_->CreateFAdd( - ir_builder_->CreateFMul(EmitExtractReal(lhs_value), - EmitExtractImag(rhs_value)), - ir_builder_->CreateFMul(EmitExtractImag(lhs_value), - EmitExtractReal(rhs_value))); - next_accumulator = ir_builder_->CreateInsertValue( - current_accumulator, - ir_builder_->CreateFAdd(EmitExtractReal(current_accumulator), - product_real), - {0}); - next_accumulator = ir_builder_->CreateInsertValue( - next_accumulator, - ir_builder_->CreateFAdd(EmitExtractImag(current_accumulator), - product_imag), - {1}); - } else if (primitive_util::IsFloatingPointType(primitive_type)) { - next_accumulator = ir_builder_->CreateFAdd( - current_accumulator, - ir_builder_->CreateFMul(lhs_value, rhs_value)); - } else { - next_accumulator = ir_builder_->CreateAdd( - current_accumulator, - ir_builder_->CreateMul(lhs_value, rhs_value)); - } - ir_builder_->CreateStore(next_accumulator, accumulator_alloca); - - SetToFirstInsertPoint(inner_loop->GetExitBasicBlock(), ir_builder_); - return ir_builder_->CreateLoad(accumulator_alloca); + return EmitElementalDot(hlo, operand_to_generator, dot_result_index); }; default: return [this, hlo, &operand_to_generator](const IrArray::Index& index) { diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/elemental_ir_emitter.h index c516a826d9e382..d199473374ad39 100644 --- a/tensorflow/compiler/xla/service/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.h @@ -105,6 +105,9 @@ class ElementalIrEmitter { virtual StatusOr EmitLog(PrimitiveType prim_type, llvm::Value* value) const; + virtual StatusOr EmitLog1p(PrimitiveType prim_type, + llvm::Value* value) const; + virtual StatusOr EmitSin(PrimitiveType prim_type, llvm::Value* value) const; @@ -114,6 +117,9 @@ class ElementalIrEmitter { virtual StatusOr EmitExp(PrimitiveType prim_type, llvm::Value* value) const; + virtual StatusOr EmitExpm1(PrimitiveType prim_type, + llvm::Value* value) const; + virtual StatusOr EmitPow(PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const; @@ -142,6 +148,46 @@ class ElementalIrEmitter { return ir_builder_->getIntN(128, 0); } + StatusOr EmitElementalSelect( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const; + + StatusOr EmitElementalClamp( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const 
llvm_ir::IrArray::Index& index) const; + + StatusOr EmitElementalConcatenate( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& target_index) const; + + StatusOr EmitElementalDynamicSlice( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const; + + StatusOr EmitElementalGather( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const; + + StatusOr EmitElementalDynamicUpdateSlice( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& index) const; + + StatusOr EmitElementalPad( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& padded_index) const; + + StatusOr EmitElementalDot( + const HloInstruction* hlo, + const HloToElementGeneratorMap& operand_to_generator, + const llvm_ir::IrArray::Index& dot_result_index) const; + llvm::IRBuilder<>* const ir_builder_; llvm::Module* module_; diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc new file mode 100644 index 00000000000000..8980d4303353a1 --- /dev/null +++ b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc @@ -0,0 +1,65 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/client_library_test_base.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +using tensorflow::gtl::nullopt; + +class ElementalIrEmitterExecutionTest : public HloTestBase { + protected: + void RunTest(const string& hlo_text, + tensorflow::gtl::ArraySlice args) { + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_text, config)); + EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), args, nullopt)); + } +}; + +XLA_TEST_F(ElementalIrEmitterExecutionTest, DotFusion) { + const string hlo_text = R"( +HloModule FusedDot + +fused_computation { + arg0 = s32[1,2,1]{2,1,0} parameter(0) + reshape.lhs = s32[2,1]{1,0} reshape(arg0) + arg1 = s32[1,2,1]{2,1,0} parameter(1) + reshape.rhs = s32[2,1]{1,0} reshape(arg1) + ROOT dot = s32[1,1]{1,0} dot(reshape.lhs, reshape.rhs), lhs_contracting_dims={0}, rhs_contracting_dims={0} +} + +ENTRY main { + entry_arg0 = s32[1,2,1]{2,1,0} parameter(0) + entry_arg1 = s32[1,2,1]{2,1,0} parameter(1) + ROOT fusion = s32[1,1]{1,0} fusion(entry_arg0, entry_arg1), kind=kLoop, calls=fused_computation +} +)"; + + std::unique_ptr lhs = Literal::CreateR3({{{1}, {2}}}); + std::unique_ptr rhs = Literal::CreateR3({{{3}, {4}}}); + RunTest(hlo_text, {lhs.get(), rhs.get()}); +} +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc index 021f09d310b718..6df172db8e541c 100644 --- a/tensorflow/compiler/xla/service/executable.cc +++ b/tensorflow/compiler/xla/service/executable.cc @@ -129,38 +129,17 @@ StatusOr Executable::ExecuteOnStreamWrapper( return return_value; } -Status Executable::DumpSessionModule() { - TF_RET_CHECK(dumping()); +Status Executable::DumpHloSnapshot() { + TF_RET_CHECK(dumping_snapshot()); + TF_RET_CHECK(hlo_snapshot_->has_hlo() && + hlo_snapshot_->hlo().has_hlo_module()); const string& directory_path = module_config().debug_options().xla_dump_executions_to(); - VersionedComputationHandle versioned_handle = entry_computation_handle(); - // This filename does not include the version number because the computation - // is only ever executed at one version. + const auto& module = hlo_snapshot_->hlo().hlo_module(); string filename = tensorflow::strings::Printf( - "computation_%lld__%s__execution_%lld", versioned_handle.handle.handle(), - session_module_->entry().name().c_str(), ++execution_count_); - return Executable::DumpToDirectory(directory_path, filename, - *session_module_); -} - -/* static */ Status Executable::DumpToDirectory( - const string& directory_path, string filename, - const SessionModule& session_module) { - tensorflow::Env* env = tensorflow::Env::Default(); - if (!env->IsDirectory(directory_path).ok()) { - // NB! CreateDir does not work reliably with multiple XLA threads -- two - // threads can race to observe the absence of the dump directory and - // simultaneously try to create it, causing the "losing" thread to get a - // "directory already exists" error. 
- TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path)); - } - filename = SanitizeFileName(std::move(filename)); - string file_path = tensorflow::io::JoinPath(directory_path, filename); - string result; - TF_RET_CHECK( - tensorflow::SerializeToStringDeterministic(session_module, &result)); - return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path, - result); + "computation_%lld__%s__execution_%lld", module.id(), + module.entry_computation_name().c_str(), ++execution_count_); + return Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot_); } /* static */ Status Executable::DumpToDirectory( diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h index f7af1ca5749297..087bd1432945ab 100644 --- a/tensorflow/compiler/xla/service/executable.h +++ b/tensorflow/compiler/xla/service/executable.h @@ -27,7 +27,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/service_executable_run_options.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/service/shaped_buffer.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/statusor.h" @@ -140,21 +139,17 @@ class Executable { // The shape (including layout) that results from this execution. This is the // shape of the DeviceMemoryBase result value in ExecuteOnStream above. - const Shape& result_shape() const { - return hlo_module_->config().entry_computation_layout().result_shape(); + const Shape& host_result_shape() const { + return hlo_module_->config().host_entry_computation_layout().result_shape(); } // Dumping helpers. - void set_session_module(std::unique_ptr session_module) { - session_module_ = std::move(session_module); + void set_hlo_snapshot(std::unique_ptr hlo_snapshot) { + hlo_snapshot_ = std::move(hlo_snapshot); } - bool dumping() const { return session_module_ != nullptr; } - SessionModule* session_module() const { return session_module_.get(); } - Status DumpSessionModule(); - - // Dump session_module to directory_path/filename. - static Status DumpToDirectory(const string& directory_path, string filename, - const SessionModule& session_module); + bool dumping_snapshot() const { return hlo_snapshot_ != nullptr; } + HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); } + Status DumpHloSnapshot(); // Dump hlo snapshot to directory_path/filename. static Status DumpToDirectory(const string& directory_path, string filename, @@ -171,8 +166,8 @@ class Executable { // around. const std::unique_ptr hlo_module_; - // SessionModule this was compiled from. Null if not dumping executions. - std::unique_ptr session_module_; + // HloSnapshot this was compiled from. Null if not dumping executions. + std::unique_ptr hlo_snapshot_; // Execution count, used to generate a unique filename for each dumped // execution. 
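For reference, the scalar math behind the new `EmitLog1p`/`EmitExpm1` helpers and the complex `kLog1p`/`kExpm1` cases added to the elemental IR emitter earlier in this diff can be sketched in ordinary C++. The threshold constants below are copied from the diff; the function names and the small test driver are only illustrative and are not part of the change, which emits equivalent LLVM IR instead of calling any such helpers.

```c++
// Illustrative sketch only: the small-|x| fallbacks and complex identities
// from the elemental_ir_emitter.cc changes, written as plain C++.
#include <cmath>
#include <complex>
#include <cstdio>

double Log1pSketch(double x) {
  const double kAntilogarithmIsSmallThreshold = 1e-4;
  if (std::fabs(x) < kAntilogarithmIsSmallThreshold) {
    return x - 0.5 * x * x;  // Two-term Taylor expansion of ln(1 + x).
  }
  return std::log(1.0 + x);  // The naive form is accurate once |x| is not tiny.
}

double Expm1Sketch(double x) {
  const double kExponentIsSmallThreshold = 1e-5;
  if (std::fabs(x) < kExponentIsSmallThreshold) {
    return x + 0.5 * x * x;  // Two-term Taylor expansion of e^x - 1.
  }
  return std::exp(x) - 1.0;
}

// log1p(a+bi) = 0.5*log((a+1)^2 + b^2) + i*atan2(b, a+1)
std::complex<double> ComplexLog1pSketch(std::complex<double> z) {
  const double a_plus_one = z.real() + 1.0;
  const double b = z.imag();
  return {0.5 * std::log(a_plus_one * a_plus_one + b * b),
          std::atan2(b, a_plus_one)};
}

// expm1(a+bi) = (e^a*cos(b) - 1) + i*e^a*sin(b)
std::complex<double> ComplexExpm1Sketch(std::complex<double> z) {
  const double exp_a = std::exp(z.real());
  return {exp_a * std::cos(z.imag()) - 1.0, exp_a * std::sin(z.imag())};
}

int main() {
  for (double x : {1e-8, 1e-3, 0.5}) {
    std::printf("x=%-8g log1p %.12g (std %.12g)  expm1 %.12g (std %.12g)\n", x,
                Log1pSketch(x), std::log1p(x), Expm1Sketch(x), std::expm1(x));
  }
  const std::complex<double> z(0.3, -0.7);
  const std::complex<double> l = ComplexLog1pSketch(z);
  const std::complex<double> e = ComplexExpm1Sketch(z);
  std::printf("complex log1p (%g, %g)  expm1 (%g, %g)\n", l.real(), l.imag(),
              e.real(), e.imag());
  return 0;
}
```

Near each threshold the Taylor branch and the naive branch of this sketch agree to many significant digits, which is what the select between `for_small_x` and `for_large_x` in the emitted IR relies on.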
diff --git a/tensorflow/compiler/xla/service/execution_tracker.cc b/tensorflow/compiler/xla/service/execution_tracker.cc index 2f0b9ed2bd98fb..6794cfe297b0fb 100644 --- a/tensorflow/compiler/xla/service/execution_tracker.cc +++ b/tensorflow/compiler/xla/service/execution_tracker.cc @@ -37,11 +37,11 @@ AsyncExecution::AsyncExecution(Backend* backend, } } -tensorflow::Status AsyncExecution::BlockUntilDone() const { +Status AsyncExecution::BlockUntilDone() const { for (auto& stream : streams_) { TF_RETURN_IF_ERROR(stream->BlockHostUntilDone()); } - return tensorflow::Status::OK(); + return Status::OK(); } ExecutionTracker::ExecutionTracker() : next_handle_(1) {} @@ -61,7 +61,7 @@ ExecutionHandle ExecutionTracker::Register( return execution_handle; } -tensorflow::Status ExecutionTracker::Unregister(const ExecutionHandle& handle) { +Status ExecutionTracker::Unregister(const ExecutionHandle& handle) { tensorflow::mutex_lock lock(execution_mutex_); auto it = handle_to_execution_.find(handle.handle()); if (it == handle_to_execution_.end()) { @@ -69,7 +69,7 @@ tensorflow::Status ExecutionTracker::Unregister(const ExecutionHandle& handle) { handle.handle()); } handle_to_execution_.erase(handle.handle()); - return tensorflow::Status::OK(); + return Status::OK(); } StatusOr ExecutionTracker::Resolve( diff --git a/tensorflow/compiler/xla/service/execution_tracker.h b/tensorflow/compiler/xla/service/execution_tracker.h index 5b6bddf9f16a85..4458152dd9a988 100644 --- a/tensorflow/compiler/xla/service/execution_tracker.h +++ b/tensorflow/compiler/xla/service/execution_tracker.h @@ -43,7 +43,7 @@ class AsyncExecution { AsyncExecution(Backend* backend, std::vector streams, const ExecutionProfile& profile, GlobalDataHandle result); - tensorflow::Status BlockUntilDone() const; + Status BlockUntilDone() const; const GlobalDataHandle& result() const { return result_; } @@ -77,7 +77,7 @@ class ExecutionTracker { GlobalDataHandle data); // Unregisters the execution for the given handle. - tensorflow::Status Unregister(const ExecutionHandle& handle); + Status Unregister(const ExecutionHandle& handle); // Resolves the given ExecutionHandle to an AsyncExecution. Returns an // error status if the given handle is not found, which means that the diff --git a/tensorflow/compiler/xla/tools/parser/README.md b/tensorflow/compiler/xla/service/g3doc/hlo_parser.md similarity index 100% rename from tensorflow/compiler/xla/tools/parser/README.md rename to tensorflow/compiler/xla/service/g3doc/hlo_parser.md diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc index 1c72ca066502eb..020ffcd106862c 100644 --- a/tensorflow/compiler/xla/service/gather_expander_test.cc +++ b/tensorflow/compiler/xla/service/gather_expander_test.cc @@ -14,9 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/gather_expander.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { @@ -36,7 +36,7 @@ ENTRY main { } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text)); + ParseHloString(hlo_text)); Status status = GatherExpander{}.Run(module.get()).status(); EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED); @@ -63,7 +63,7 @@ ENTRY main { } )"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text)); + ParseHloString(hlo_text)); TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get())); ASSERT_TRUE(changed); diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.cc b/tensorflow/compiler/xla/service/generic_transfer_manager.cc index ddb687314ee822..5ee67ccb4ae147 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.cc @@ -89,7 +89,7 @@ GenericTransferManager::TransferLiteralFromDevice( } Status GenericTransferManager::TransferLiteralToDevice( - se::StreamExecutor* executor, const Literal& literal, + se::StreamExecutor* executor, const LiteralSlice& literal, const ShapedBuffer& device_buffer) { const Shape& shape = literal.shape(); VLOG(2) << "transferring literal shape to device: " @@ -115,7 +115,7 @@ Status GenericTransferManager::TransferLiteralToDevice( TF_RET_CHECK(GetByteSizeRequirement(device_subshape) == device_memory.size()); // Element is array-shaped: transfer array data to device buffer. - const auto subliteral = LiteralView::Create(literal, index); + const auto subliteral = LiteralSlice(literal, index); std::unique_ptr relayed_out_literal; const void* source; if (LayoutUtil::Equal(device_subshape.layout(), @@ -137,7 +137,7 @@ Status GenericTransferManager::TransferLiteralToDevice( } Status GenericTransferManager::TransferLiteralToInfeed( - se::StreamExecutor* executor, const Literal& literal) { + se::StreamExecutor* executor, const LiteralSlice& literal) { return Unimplemented("Generic transfer to Infeed"); } diff --git a/tensorflow/compiler/xla/service/generic_transfer_manager.h b/tensorflow/compiler/xla/service/generic_transfer_manager.h index 0579099de40ba3..3da9570ef7eebc 100644 --- a/tensorflow/compiler/xla/service/generic_transfer_manager.h +++ b/tensorflow/compiler/xla/service/generic_transfer_manager.h @@ -45,11 +45,11 @@ class GenericTransferManager : public TransferManager { se::StreamExecutor* executor, const ShapedBuffer& device_buffer) override; Status TransferLiteralToDevice(se::StreamExecutor* executor, - const Literal& literal, + const LiteralSlice& literal, const ShapedBuffer& device_buffer) override; Status TransferLiteralToInfeed(se::StreamExecutor* executor, - const Literal& literal) override; + const LiteralSlice& literal) override; Status TransferLiteralFromOutfeed(se::StreamExecutor* executor, const Shape& literal_shape, Literal* literal) override; diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD index 9a5ad2807591f0..16ab2d78c9cf45 100644 --- a/tensorflow/compiler/xla/service/gpu/BUILD +++ b/tensorflow/compiler/xla/service/gpu/BUILD @@ -1,6 +1,8 @@ # Description: # GPU-specific components in XLA service implementation. 
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library") + licenses(["notice"]) # Apache 2.0 package(default_visibility = [":friends"]) @@ -27,6 +29,11 @@ load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm") load("@local_config_rocm//rocm:build_defs.bzl", "if_rocm_is_configured") +xla_proto_library( + name = "backend_configs", + srcs = ["backend_configs.proto"], +) + cc_library( name = "gpu_constants", srcs = ["gpu_constants.cc"], @@ -137,6 +144,7 @@ cc_library( "ir_emitter_unnested.h", ], deps = [ + ":backend_configs", ":cudnn_convolution_runner", ":elemental_ir_emitter", ":gpu_constants", @@ -273,6 +281,7 @@ cc_library( ] + if_cuda_is_configured(if_cuda(["nvptx_executable.h"])) + if_rocm_is_configured(if_rocm(["amdgpu_executable.h"])), deps = [ + ":backend_configs", ":buffer_allocations", ":cudnn_convolution_runner", ":infeed_manager", @@ -298,6 +307,7 @@ cc_library( "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/compiler/xla/service:tuple_points_to_analysis", "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core/platform/default/build_config:cublas_plugin", "//tensorflow/core/platform/default/build_config:cudnn_plugin", @@ -329,6 +339,7 @@ cc_library( srcs = ["cudnn_convolution_algorithm_picker.cc"], hdrs = ["cudnn_convolution_algorithm_picker.h"], deps = [ + ":backend_configs", ":cudnn_convolution_runner", ":gpu_executable", ":ir_emission_utils", @@ -345,6 +356,7 @@ cc_library( srcs = ["cudnn_convolution_runner.cc"], hdrs = ["cudnn_convolution_runner.h"], deps = [ + ":stream_executor_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status", "//tensorflow/compiler/xla:status_macros", @@ -396,8 +408,10 @@ cc_library( deps = [ ":ir_emission_utils", "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:instruction_fusion", + "//tensorflow/compiler/xla/service:pattern_matcher", ], ) @@ -406,10 +420,13 @@ tf_cc_test( srcs = ["instruction_fusion_test.cc"], deps = [ ":instruction_fusion", + "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -451,9 +468,9 @@ tf_cc_test( ":instruction_fusion", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla/service:hlo_matchers", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -549,6 +566,8 @@ cc_library( "//tensorflow/compiler/xla/service:reshape_mover", "//tensorflow/compiler/xla/service:transpose_folding", "//tensorflow/compiler/xla/service:tuple_simplifier", + "//tensorflow/compiler/xla/service:while_loop_constant_sinking", + "//tensorflow/compiler/xla/service:while_loop_invariant_code_motion", "//tensorflow/compiler/xla/service:while_loop_simplifier", "//tensorflow/compiler/xla/service:zero_sized_hlo_elimination", "//tensorflow/compiler/xla/service/gpu:cudnn_batchnorm_rewriter", @@ -595,14 +614,18 @@ cc_library( srcs = 
["gpu_layout_assignment.cc"], hdrs = ["gpu_layout_assignment.h"], deps = [ + ":gpu_options", ":ir_emission_utils", + ":stream_executor_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:layout_assignment", "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", ], ) @@ -631,6 +654,7 @@ cc_library( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla/service:buffer_value", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/service:hlo_ordering", "//tensorflow/compiler/xla/service:hlo_reachability", @@ -698,6 +722,27 @@ cc_library( ], ) +cc_library( + name = "gpu_options", + srcs = ["gpu_options.cc"], + hdrs = ["gpu_options.h"], + deps = [ + "//tensorflow/compiler/xla/service:hlo_module_config", + "//tensorflow/core:lib_internal", + ], +) + +cc_library( + name = "stream_executor_util", + srcs = ["stream_executor_util.cc"], + hdrs = ["stream_executor_util.h"], + deps = [ + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/core:stream_executor_no_cuda", + ], +) + tf_cc_test( name = "gpu_hlo_support_checker_test", srcs = ["gpu_hlo_support_checker_test.cc"], diff --git a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc index 53471249d2dcbf..908140e3f076d3 100644 --- a/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/amdgpu_compiler.cc @@ -100,7 +100,7 @@ namespace gpu { namespace { -using tensorflow::port::Tracing; +namespace tracing = tensorflow::tracing; // Returns the directory containing ROCm-Device-Libs files. This function is // called in AMDGPUCompiler's constructor, so can't return an error. But @@ -253,7 +253,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, { HloPassPipeline pipeline("layout_assignment"); pipeline.AddPass( - hlo_module->mutable_entry_computation_layout()); + hlo_module->mutable_device_entry_computation_layout(), stream_exec); // The LayoutAssignment pass may leave behind kCopy instructions which are // duplicate or NOPs, so remove them with algebraic simplification and CSE. @@ -323,7 +323,7 @@ StatusOr> AMDGPUCompiler::RunHloPasses( std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("AMDGPUCompiler::RunHloPasses"); - Tracing::TraceMe annotation("HLO Transforms", module->name(), + tracing::ScopedActivity activity("HLO Transforms", module->name(), /*is_expensive=*/true); TF_RETURN_IF_ERROR( OptimizeHloModule(module.get(), stream_exec, device_allocator)); diff --git a/tensorflow/compiler/xla/service/gpu/backend_configs.proto b/tensorflow/compiler/xla/service/gpu/backend_configs.proto new file mode 100644 index 00000000000000..640c6392b8b820 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/backend_configs.proto @@ -0,0 +1,27 @@ +syntax = "proto3"; + +package xla.gpu; + +// Backend configs for XLA:GPU. +// +// These are metadata that the GPU backend attaches to HloInstrucitons and later +// uses during e.g. codegen. 
+// +// Remember that proto3 doesn't give clients a way to tell the difference +// between a field not being present and a field having the default value. +// Choose your defaults carefully. +// +// No guarantee is made about the stability of these protos. +// +// See HloInstruction::backend_config() for more info. + +// Backend config for a convolution that runs through cudnn. +message CudnnConvBackendConfig { + // Opaque algorithm number of cudnn algorithm chosen for this conv. + int64 algorithm = 1; + + // Whether we may use tensor cores when running this conv. Even if this is + // true, cudnn may choose not to use tensor cores, e.g. because the GPU or + // selected algorithm doesn't support it. + bool tensor_ops_enabled = 2; +} diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc index 837f05244f7a8c..ab5149dcdb0929 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.cc @@ -37,11 +37,11 @@ void BufferAllocations::Builder::RegisterBuffer(BufferAllocation::Index index, } StatusOr> BufferAllocations::Builder::Build( - const BufferAssignment& buffer_assignment, int device_ordinal, + const BufferAssignment* buffer_assignment, int device_ordinal, DeviceMemoryAllocator* memory_allocator) { - const int64 num_buffers = buffer_assignment.Allocations().size(); - auto buffer_allocations = WrapUnique( - new BufferAllocations(num_buffers, device_ordinal, memory_allocator)); + const int64 num_buffers = buffer_assignment->Allocations().size(); + auto buffer_allocations = WrapUnique(new BufferAllocations( + num_buffers, device_ordinal, memory_allocator, buffer_assignment)); for (BufferAllocation::Index i = 0; i < num_buffers; ++i) { // If buffer #i's address is already registered (e.g. external arguments or @@ -62,28 +62,28 @@ StatusOr> BufferAllocations::Builder::Build( // Allocate each allocation that might escape, or is the temp buffer. bool seen_temp_buffer = false; - const BufferAllocation& allocation = buffer_assignment.GetAllocation(i); + const BufferAllocation& allocation = buffer_assignment->GetAllocation(i); if (allocation.maybe_live_out() || allocation.IsPreallocatedTempBuffer()) { const int64 buffer_size = allocation.size(); se::DeviceMemoryBase buffer_address; if (buffer_size > 0) { - TF_ASSIGN_OR_RETURN(buffer_address, memory_allocator->Allocate( - device_ordinal, buffer_size)); - if (buffer_address == nullptr) { - return ResourceExhausted( - "Out of memory when allocating %s for buffer %lld.", - tensorflow::strings::HumanReadableNumBytes(buffer_size).c_str(), - i); - } - if (reinterpret_cast(buffer_address.opaque()) % + OwningDeviceMemory buffer; + TF_ASSIGN_OR_RETURN( + buffer, memory_allocator->Allocate(device_ordinal, buffer_size)); + if (reinterpret_cast(buffer.opaque()) % kCudaMallocAlignBytes != 0) { return InternalError( "Address returned by memory_allocator->Allocate must be a " "multiple of %llx, but was %p", - kCudaMallocAlignBytes, buffer_address.opaque()); + kCudaMallocAlignBytes, buffer.opaque()); } + // We do manual memory management within BufferAllocations. Be sure not + // to do a TF_RETURN_IF_ERROR between this line and the + // buffer_allocations->SetBuffer(buffer_address) call below! 
+ buffer_address = buffer.Forget(); } + buffer_allocations->SetBuffer(i, buffer_address); if (allocation.IsPreallocatedTempBuffer()) { if (seen_temp_buffer) { @@ -103,28 +103,42 @@ StatusOr> BufferAllocations::Builder::Build( << "B)"; } } - return std::move(buffer_allocations); } -tensorflow::Status BufferAllocations::TearDown( - const std::set& live_addresses, - const BufferAssignment& buffer_assignment) { - // Deallocate temporary buffers. - const int64 num_buffers = buffer_assignment.Allocations().size(); +BufferAllocations::~BufferAllocations() { + if (!torn_down_) { + // Presumably if we're executing this branch, the caller is in an error + // state, otherwise it would have explicitly called TearDown so it could + // save some set of live addresses. So ignoring any errors in TearDown is + // sensible. + TearDown(/*live_addresses=*/{}).IgnoreError(); + } +} + +Status BufferAllocations::TearDown( + const std::set& live_addresses) { + // Deallocate temporary buffers, taking care to try to deallocate all of them + // even if one of the deallocations fails. + Status status; + const int64 num_buffers = buffer_assignment_->Allocations().size(); for (BufferAllocation::Index i = 0; i < num_buffers; ++i) { - const BufferAllocation& allocation = buffer_assignment.GetAllocation(i); + const BufferAllocation& allocation = buffer_assignment_->GetAllocation(i); se::DeviceMemoryBase buffer_address = GetDeviceAddress(allocation.index()); // Deallocate buffers marked "maybe_live_out" but aren't actually live out, // and temp buffers. if ((allocation.maybe_live_out() && !live_addresses.count(buffer_address)) || allocation.IsPreallocatedTempBuffer()) { - TF_RETURN_IF_ERROR( - memory_allocator_->Deallocate(device_ordinal_, &buffer_address)); + auto dealloc_result = + memory_allocator_->Deallocate(device_ordinal_, buffer_address); + if (!dealloc_result.ok() && status.ok()) { + status = dealloc_result; + } } } - return tensorflow::Status::OK(); + torn_down_ = true; + return status; } se::DeviceMemoryBase BufferAllocations::GetDeviceAddress( diff --git a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h index c2fc35be4ca4bc..636623502597b3 100644 --- a/tensorflow/compiler/xla/service/gpu/buffer_allocations.h +++ b/tensorflow/compiler/xla/service/gpu/buffer_allocations.h @@ -48,13 +48,15 @@ class BufferAllocations { // `device_ordinal` is the number of the device this function allocates // memory on. StatusOr> Build( - const BufferAssignment& buffer_assignment, int device_ordinal, + const BufferAssignment* buffer_assignment, int device_ordinal, DeviceMemoryAllocator* memory_allocator); private: std::map registered_buffers_; }; + ~BufferAllocations(); + BufferAllocations(const BufferAllocations&) = delete; BufferAllocations& operator=(const BufferAllocations&) = delete; @@ -76,16 +78,16 @@ class BufferAllocations { // Tears down all buffers allocated by this object that are not in // `live_addresses`. 
- tensorflow::Status TearDown( - const std::set& live_addresses, - const BufferAssignment& buffer_assignment); + Status TearDown(const std::set& live_addresses); private: BufferAllocations(BufferAllocation::Index buffer_count, int device_ordinal, - DeviceMemoryAllocator* memory_allocator) + DeviceMemoryAllocator* memory_allocator, + const BufferAssignment* buffer_assignment) : buffers_(buffer_count), device_ordinal_(device_ordinal), - memory_allocator_(memory_allocator) {} + memory_allocator_(memory_allocator), + buffer_assignment_(buffer_assignment) {} // Sets the device address of buffer `buffer_index`. void SetBuffer(BufferAllocation::Index buffer_index, @@ -100,8 +102,9 @@ class BufferAllocations { se::DeviceMemoryBase temp_buffer_base_; int device_ordinal_; - DeviceMemoryAllocator* memory_allocator_; + const BufferAssignment* buffer_assignment_; + bool torn_down_ = false; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc index dce8de2e301ecf..77a48965e03134 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.cc @@ -35,9 +35,10 @@ ConditionalThunk::ConditionalThunk( true_thunk_(std::move(true_thunk_sequence), hlo), false_thunk_(std::move(false_thunk_sequence), hlo) {} -Status ConditionalThunk::Initialize(const GpuExecutable& executable) { - TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable)); - TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable)); +Status ConditionalThunk::Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) { + TF_RETURN_IF_ERROR(true_thunk_.Initialize(executable, executor)); + TF_RETURN_IF_ERROR(false_thunk_.Initialize(executable, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h index e40872688fdad2..ee03865d174469 100644 --- a/tensorflow/compiler/xla/service/gpu/conditional_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/conditional_thunk.h @@ -47,7 +47,8 @@ class ConditionalThunk : public Thunk { ConditionalThunk(const ConditionalThunk&) = delete; ConditionalThunk& operator=(const ConditionalThunk&) = delete; - Status Initialize(const GpuExecutable& executable) override; + Status Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, se::Stream* stream) override; diff --git a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc index 64d3b84b8c73d8..f0881124128c9b 100644 --- a/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/convolution_thunk.cc @@ -29,11 +29,6 @@ namespace xla { namespace gpu { using se::dnn::AlgorithmDesc; -using se::dnn::BatchDescriptor; -using se::dnn::ConvolutionDescriptor; -using se::dnn::DataLayout; -using se::dnn::FilterDescriptor; -using se::dnn::FilterLayout; ConvolutionThunk::ConvolutionThunk( CudnnConvKind convolution_kind, const BufferAllocation::Slice& input_buffer, diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc index bf912fbd14de58..ee38c0318a878c 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.cc @@ -29,12 +29,12 @@ HostToDeviceCopyThunk::HostToDeviceCopyThunk( 
destination_buffer_(destination_buffer), mem_size_(mem_size) {} -tensorflow::Status HostToDeviceCopyThunk::ExecuteOnStream( +Status HostToDeviceCopyThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); stream->ThenMemcpy(&destination_data, source_address_, mem_size_); - return tensorflow::Status::OK(); + return Status::OK(); } DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk( @@ -46,14 +46,14 @@ DeviceToDeviceCopyThunk::DeviceToDeviceCopyThunk( destination_buffer_(destination_buffer), mem_size_(mem_size) {} -tensorflow::Status DeviceToDeviceCopyThunk::ExecuteOnStream( +Status DeviceToDeviceCopyThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { se::DeviceMemoryBase destination_data = buffer_allocations.GetDeviceAddress(destination_buffer_); se::DeviceMemoryBase source_data = buffer_allocations.GetDeviceAddress(source_buffer_); stream->ThenMemcpy(&destination_data, source_data, mem_size_); - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/copy_thunk.h b/tensorflow/compiler/xla/service/gpu/copy_thunk.h index 2e7eb5f3445bc9..8b128386f61636 100644 --- a/tensorflow/compiler/xla/service/gpu/copy_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/copy_thunk.h @@ -39,8 +39,8 @@ class HostToDeviceCopyThunk : public Thunk { HostToDeviceCopyThunk(const HostToDeviceCopyThunk&) = delete; HostToDeviceCopyThunk& operator=(const HostToDeviceCopyThunk&) = delete; - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; private: const void* source_address_; @@ -62,8 +62,8 @@ class DeviceToDeviceCopyThunk : public Thunk { DeviceToDeviceCopyThunk(const DeviceToDeviceCopyThunk&) = delete; DeviceToDeviceCopyThunk& operator=(const DeviceToDeviceCopyThunk&) = delete; - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; private: const BufferAllocation::Slice source_buffer_; diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc index c4c56c56928810..3dc98c4c93ea2b 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/core/lib/gtl/optional.h" @@ -35,35 +36,22 @@ class ScratchAllocator : public se::ScratchAllocator { ScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator) : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} - ~ScratchAllocator() override; - int64 GetMemoryLimitInBytes(se::Stream* stream) override { return 1LL << 32; // 4GB. 
TODO(jlebar): Tune this? } int64 TotalAllocatedBytes() { return total_allocated_bytes_; } - se::port::StatusOr> AllocateBytes( - se::Stream* stream, int64 byte_size) override; + StatusOr> AllocateBytes(se::Stream* stream, + int64 byte_size) override; private: const int device_ordinal_; DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; + std::vector allocated_buffers_; int64 total_allocated_bytes_ = 0; }; -ScratchAllocator::~ScratchAllocator() { - for (auto& allocated_buffer : allocated_buffers_) { - if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer) - .ok()) { - // The program can still continue with failed deallocation. - LOG(ERROR) << "Failed to deallocate the allocated buffer: " - << allocated_buffer.opaque(); - } - } -} - -se::port::StatusOr> ScratchAllocator::AllocateBytes( +StatusOr> ScratchAllocator::AllocateBytes( se::Stream* stream, int64 byte_size) { CHECK_GE(byte_size, 0) << "byte_size must be positive."; if (byte_size > GetMemoryLimitInBytes(stream)) { @@ -74,19 +62,14 @@ se::port::StatusOr> ScratchAllocator::AllocateBytes( byte_size, GetMemoryLimitInBytes(stream))); } - auto status_or_memory = - memory_allocator_->Allocate(device_ordinal_, byte_size, - /*retry_on_failure=*/false); - if (!status_or_memory.ok()) { - return se::port::Status(se::port::error::RESOURCE_EXHAUSTED, - tensorflow::strings::Printf( - "Failed to allocate %lld bytes on device %d.", - byte_size, device_ordinal_)); - } - se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie(); - allocated_buffers_.push_back(allocated_buffer); + TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer, + memory_allocator_->Allocate(device_ordinal_, byte_size, + /*retry_on_failure=*/false)); total_allocated_bytes_ += byte_size; - return se::DeviceMemory(allocated_buffer); + + se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase(); + allocated_buffers_.push_back(std::move(allocated_buffer)); + return se::DeviceMemory(buffer_addr); } // Determines whether we can safely perform a winograd non-fused convolution for @@ -197,22 +180,42 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( // We don't put any data in these buffers, because (in theory, anyway) the // speed of a conv isn't affected by the data being convolved. ScratchAllocator input_output_allocator(device_ordinal, allocator); - se::port::StatusOr input_buf = + StatusOr maybe_input_buf = input_output_allocator.AllocateBytes(&stream, ShapeUtil::ByteSizeOf(input_shape)); - se::port::StatusOr filter_buf = + StatusOr maybe_filter_buf = input_output_allocator.AllocateBytes(&stream, ShapeUtil::ByteSizeOf(filter_shape)); - se::port::StatusOr output_buf = + StatusOr maybe_output_buf = input_output_allocator.AllocateBytes(&stream, ShapeUtil::ByteSizeOf(output_shape)); - if (!input_buf.ok() || !filter_buf.ok() || !output_buf.ok()) { + if (!maybe_input_buf.ok() || !maybe_filter_buf.ok() || + !maybe_output_buf.ok()) { LOG(WARNING) << "Couldn't allocate space for input/filter/output of convolution " << instr->ToString() << ". Falling back to default algorithm."; return nullopt; } + DeviceMemoryBase input_buf = maybe_input_buf.ValueOrDie(); + DeviceMemoryBase filter_buf = maybe_filter_buf.ValueOrDie(); + DeviceMemoryBase output_buf = maybe_output_buf.ValueOrDie(); + + // Although we don't have evidence this matters, zero out the buffers before + // autotuning. It's conceivable that using uninitialized memory as the inputs + // might affect performance if e.g. 
the inputs contain denormals, and this is + // easy enough. + if (!stream.ThenMemZero(&input_buf, input_buf.size()) + .ThenMemZero(&filter_buf, filter_buf.size()) + .ThenMemZero(&output_buf, output_buf.size()) + .BlockHostUntilDone() + .ok()) { + LOG(WARNING) + << "Couldn't zero out input/filter/output buffer for convolution " + << instr->ToString() << ". Falling back to default algorithm."; + return nullopt; + } + const bool use_winograd_nonfused = ShouldIncludeWinogradNonfusedAlgo( input_shape, output_shape, dnums, stream_exec_); se::dnn::ProfileResult best_result; @@ -225,12 +228,12 @@ CudnnConvolutionAlgorithmPicker::PickBestAlgorithm( VLOG(3) << "Trying algorithm " << AlgorithmToString(alg) << " for " << instr->ToString(); - bool launch_ok = RunCudnnConvolution( - kind, input_shape, filter_shape, output_shape, - input_buf.ValueOrDie(), filter_buf.ValueOrDie(), - output_buf.ValueOrDie(), &scratch_allocator, window, - dnums, AlgorithmConfig(alg), &stream, &profile_result) - .ok(); + bool launch_ok = + RunCudnnConvolution(kind, input_shape, filter_shape, output_shape, + input_buf, filter_buf, output_buf, + &scratch_allocator, window, dnums, + AlgorithmConfig(alg), &stream, &profile_result) + .ok(); if (launch_ok && profile_result.is_valid()) { int64 scratch_bytes_used = scratch_allocator.TotalAllocatedBytes(); @@ -314,21 +317,20 @@ StatusOr CudnnConvolutionAlgorithmPicker::RunOnInstruction( Shape new_call_shape = ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0), ShapeUtil::MakeShape(U8, {scratch_bytes})}); - HloInstruction* algorithm_hlo = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(algorithm))); - HloInstruction* tensor_ops_enabled_hlo = - computation->AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR0(tensor_ops_enabled))); + + CudnnConvBackendConfig backend_config; + backend_config.set_algorithm(algorithm); + backend_config.set_tensor_ops_enabled(tensor_ops_enabled); HloInstruction* new_call = computation->AddInstruction(HloInstruction::CreateCustomCall( new_call_shape, - {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo, - tensor_ops_enabled_hlo}, + {instr->mutable_operand(0), instr->mutable_operand(1)}, instr->custom_call_target())); new_call->set_window(instr->window()); new_call->set_convolution_dimension_numbers( instr->convolution_dimension_numbers()); + TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config)); // Repackage new_call so it has the same shape as the original call, namely // (conv_result, u8[0]). diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc index 10b4c3de89989c..0645fbb3ad39f1 100644 --- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc +++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_runner.h" +#include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -113,8 +115,17 @@ Status RunCudnnConvolution( // cuDNN's convolution APIs support the BDYX layout for activations/output and // the OIYX layout for weights. 
+ DataLayout input_dl; + FilterLayout filter_dl; + DataLayout output_dl; + + TF_ASSIGN_OR_RETURN(std::tie(input_dl, filter_dl, output_dl), + XlaConvLayoutsToStreamExecutorLayouts( + dnums, input_shape.layout(), filter_shape.layout(), + output_shape.layout())); + BatchDescriptor input_descriptor(effective_num_dimensions); - input_descriptor.set_layout(DataLayout::kBatchDepthYX) + input_descriptor.set_layout(input_dl) .set_feature_map_count( input_shape.dimensions(dnums.input_feature_dimension())) .set_count(input_shape.dimensions(dnums.input_batch_dimension())); @@ -126,7 +137,7 @@ Status RunCudnnConvolution( } FilterDescriptor filter_descriptor(effective_num_dimensions); - filter_descriptor.set_layout(FilterLayout::kOutputInputYX) + filter_descriptor.set_layout(filter_dl) .set_input_feature_map_count( filter_shape.dimensions(dnums.kernel_input_feature_dimension())) .set_output_feature_map_count( @@ -149,7 +160,7 @@ Status RunCudnnConvolution( } BatchDescriptor output_descriptor(effective_num_dimensions); - output_descriptor.set_layout(DataLayout::kBatchDepthYX) + output_descriptor.set_layout(output_dl) .set_feature_map_count( output_shape.dimensions(dnums.output_feature_dimension())) .set_count(output_shape.dimensions(dnums.output_batch_dimension())); diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc index 2dc1fc4cd064a0..1ac1159f3afeee 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc @@ -230,6 +230,11 @@ StatusOr GpuElementalIrEmitter::EmitLog( return EmitROCDLMathCall("__ocml_log", {value}, {prim_type}, prim_type); } +StatusOr GpuElementalIrEmitter::EmitLog1p( + PrimitiveType prim_type, llvm::Value* value) const { + return EmitROCDLMathCall("__ocml_log1p", {value}, {prim_type}, prim_type); +} + StatusOr GpuElementalIrEmitter::EmitSin( PrimitiveType prim_type, llvm::Value* value) const { return EmitROCDLMathCall("__ocml_sin", {value}, {prim_type}, prim_type); @@ -245,6 +250,11 @@ StatusOr GpuElementalIrEmitter::EmitExp( return EmitROCDLMathCall("__ocml_exp", {value}, {prim_type}, prim_type); } +StatusOr GpuElementalIrEmitter::EmitExpm1( + PrimitiveType prim_type, llvm::Value* value) const { + return EmitROCDLMathCall("__ocml_expm1", {value}, {prim_type}, prim_type); +} + StatusOr GpuElementalIrEmitter::EmitPow(PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const { diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h index ee257ebc01a942..b0d0fc5b77d8a9 100644 --- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.h @@ -64,6 +64,9 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitLog(PrimitiveType prim_type, llvm::Value* value) const override; + StatusOr EmitLog1p(PrimitiveType prim_type, + llvm::Value* value) const override; + StatusOr EmitSin(PrimitiveType prim_type, llvm::Value* value) const override; @@ -73,6 +76,9 @@ class GpuElementalIrEmitter : public ElementalIrEmitter { StatusOr EmitExp(PrimitiveType prim_type, llvm::Value* value) const override; + StatusOr EmitExpm1(PrimitiveType prim_type, + llvm::Value* value) const override; + StatusOr EmitPow(PrimitiveType prim_type, llvm::Value* lhs, llvm::Value* rhs) const override; diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc 
b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc index cc747addbd152e..e14ee6918bf148 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.cc @@ -31,23 +31,12 @@ FftScratchAllocator::FftScratchAllocator( int device_ordinal, DeviceMemoryAllocator* memory_allocator) : device_ordinal_(device_ordinal), memory_allocator_(memory_allocator) {} -FftScratchAllocator::~FftScratchAllocator() { - for (auto& allocated_buffer : allocated_buffers_) { - if (!memory_allocator_->Deallocate(device_ordinal_, &allocated_buffer) - .ok()) { - // The program can still continue with failed deallocation. - LOG(ERROR) << "Failed to deallocate the allocated buffer: " - << allocated_buffer.opaque(); - } - } -} - int64 FftScratchAllocator::GetMemoryLimitInBytes(se::Stream* stream) { constexpr int64 kFftScratchSize = 1LL << 32; // 4GB by default. return kFftScratchSize; } -se::port::StatusOr> FftScratchAllocator::AllocateBytes( +StatusOr> FftScratchAllocator::AllocateBytes( se::Stream* stream, int64 byte_size) { CHECK_GE(byte_size, 0) << "byte_size must be positive."; if (byte_size > GetMemoryLimitInBytes(stream)) { @@ -58,18 +47,14 @@ se::port::StatusOr> FftScratchAllocator::AllocateBytes( byte_size, GetMemoryLimitInBytes(stream))); } - auto status_or_memory = - memory_allocator_->Allocate(device_ordinal_, byte_size, - /*retry_on_failure=*/false); - if (!status_or_memory.ok()) { - return tensorflow::errors::ResourceExhausted( - "Failed to allocate %lld bytes on device %d.", byte_size, - device_ordinal_); - } - se::DeviceMemoryBase allocated_buffer = status_or_memory.ValueOrDie(); - allocated_buffers_.push_back(allocated_buffer); + TF_ASSIGN_OR_RETURN(OwningDeviceMemory allocated_buffer, + memory_allocator_->Allocate(device_ordinal_, byte_size, + /*retry_on_failure=*/false)); total_allocated_bytes_ += byte_size; - return se::DeviceMemory(allocated_buffer); + + se::DeviceMemoryBase buffer_addr = allocated_buffer.AsDeviceMemoryBase(); + allocated_buffers_.push_back(std::move(allocated_buffer)); + return se::DeviceMemory(buffer_addr); } namespace { @@ -121,8 +106,8 @@ FftThunk::FftThunk(FftType fft_type, input_shape_(input_shape), output_shape_(output_shape) {} -tensorflow::Status FftThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { +Status FftThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) { VLOG(3) << "FFT type: " << FftTypeToString(fft_type_); VLOG(3) << "Input shape: " << ShapeUtil::HumanStringWithLayout(input_shape_); VLOG(3) << "Output shape: " @@ -222,7 +207,7 @@ tensorflow::Status FftThunk::ExecuteOnStream( LOG(FATAL) << "unsupported fft type"; } if (launch_ok) { - return tensorflow::Status::OK(); + return Status::OK(); } return InternalError("Unable to launch fft for thunk %p with type %s", this, FftTypeToString(fft_type_).c_str()); diff --git a/tensorflow/compiler/xla/service/gpu/fft_thunk.h b/tensorflow/compiler/xla/service/gpu/fft_thunk.h index 24b1dca99865fe..b0a22564f3a09b 100644 --- a/tensorflow/compiler/xla/service/gpu/fft_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/fft_thunk.h @@ -39,8 +39,6 @@ class FftScratchAllocator : public se::ScratchAllocator { FftScratchAllocator(int device_ordinal, DeviceMemoryAllocator* memory_allocator); - ~FftScratchAllocator() override; - int64 GetMemoryLimitInBytes(se::Stream* stream) override; int64 TotalAllocatedBytes() { return total_allocated_bytes_; } @@ -51,7 +49,7 @@ class FftScratchAllocator : public 
se::ScratchAllocator { private: const int device_ordinal_; DeviceMemoryAllocator* memory_allocator_; - std::vector allocated_buffers_; + std::vector allocated_buffers_; int64 total_allocated_bytes_ = 0; }; @@ -73,8 +71,8 @@ class FftThunk : public Thunk { FftThunk& operator=(const FftThunk&) = delete; // Cannot share fft_plan_ // Does the FFT for the thunk on "stream". - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; private: const se::fft::Type fft_type_; diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.cc b/tensorflow/compiler/xla/service/gpu/for_thunk.cc index 6e6966df3987ee..b36539e0cb8d0a 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.cc @@ -30,19 +30,20 @@ ForThunk::ForThunk(const int64 loop_limit, body_thunk_sequence_( MakeUnique(std::move(*body_thunk_sequence), hlo)) {} -tensorflow::Status ForThunk::Initialize(const GpuExecutable& executable) { - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable)); - return tensorflow::Status::OK(); +Status ForThunk::Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) { + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); + return Status::OK(); } -tensorflow::Status ForThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { +Status ForThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) { for (int64 i = 0; i < loop_limit_; ++i) { // Invoke loop body thunk sequence. TF_RETURN_IF_ERROR( body_thunk_sequence_->ExecuteOnStream(buffer_allocations, stream)); } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/for_thunk.h b/tensorflow/compiler/xla/service/gpu/for_thunk.h index c78d1c50686297..41ddfe0ceb1d05 100644 --- a/tensorflow/compiler/xla/service/gpu/for_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/for_thunk.h @@ -36,9 +36,10 @@ class ForThunk : public Thunk { ForThunk(const ForThunk&) = delete; ForThunk& operator=(const ForThunk&) = delete; - tensorflow::Status Initialize(const GpuExecutable& executable) override; - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; private: const int64 loop_limit_; diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc index 2217776c7d5a5f..b22bb1d39ba177 100644 --- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc +++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc @@ -17,9 +17,9 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace gpu { @@ -40,7 +40,7 @@ class FusionMergerTest : public HloTestBase {}; // Tuple // TEST_F(FusionMergerTest, MergeSharedFusionInstruction) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule MergeSharedFusionInstruction comp.3 { @@ -104,7 +104,7 @@ ENTRY MergeSharedFusionInstruction.Computation0 { // // Fusion2 is not merged because it exceeds the threshold flops-to-bytes ratio. TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule FlopsToBytesRatioThresholdExceeded comp.2 { @@ -162,7 +162,7 @@ ENTRY FlopsToBytesRatioThresholdExceeded.Computation1 { // is merged into Fusion0 and Fusion1) would exceed the bytes transferred // threshold. TEST_F(FusionMergerTest, BytesTransferredThresholdExeceeded) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule BytesTransferredThresholdExeceeded comp.2 { @@ -210,7 +210,7 @@ ENTRY BytesTransferredThresholdExeceeded.Computation2 { // Fusion2 is reduced for this test which makes the merge operation into its // operand below the bytes transferred threshold. TEST_F(FusionMergerTest, BytesTransferredThresholdNotExeceeded) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule BytesTransferredThresholdNotExeceeded comp.2 { @@ -253,7 +253,7 @@ ENTRY BytesTransferredThresholdNotExeceeded.Computation2 { // Check that we're willing to merge f1_computation into f2_computation, even // though f2 is an input fusion node. TEST_F(FusionMergerTest, WillMergeIntoInputFusion) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule m f1_computation { diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc index 0ec12f52d8b398..79fca43d022816 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.cc @@ -215,14 +215,32 @@ se::blas::ComputationType GetBlasComputationType(PrimitiveType type) { } } +DotDimensionNumbers GetDimensionNumbers(const HloInstruction& hlo_instruction) { + if (hlo_instruction.opcode() == HloOpcode::kDot) { + return hlo_instruction.dot_dimension_numbers(); + } + CHECK_EQ(hlo_instruction.opcode(), HloOpcode::kFusion); + CHECK_EQ(hlo_instruction.fusion_kind(), HloInstruction::FusionKind::kOutput); + CHECK_EQ(hlo_instruction.fused_expression_root()->opcode(), + HloOpcode::kMultiply); + // Try to find the dot inside the output fusion node. 
+ const HloInstruction* dot = + hlo_instruction.fused_expression_root()->operand(0); + if (dot->opcode() != HloOpcode::kDot) { + dot = hlo_instruction.fused_expression_root()->operand(1); + } + CHECK_EQ(dot->opcode(), HloOpcode::kDot); + + return dot->dot_dimension_numbers(); +} + } // namespace GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer, const BufferAllocation::Slice& rhs_buffer, const BufferAllocation::Slice& output_buffer, const Shape& lhs_shape, const Shape& rhs_shape, - const Shape& output_shape, bool transpose_lhs, - bool transpose_rhs, double alpha, + const Shape& output_shape, double alpha, const HloInstruction* hlo_instruction) : Thunk(Kind::kGemm, hlo_instruction), lhs_buffer_(lhs_buffer), @@ -231,12 +249,10 @@ GemmThunk::GemmThunk(const BufferAllocation::Slice& lhs_buffer, lhs_shape_(lhs_shape), rhs_shape_(rhs_shape), output_shape_(output_shape), - transpose_lhs_(transpose_lhs), - transpose_rhs_(transpose_rhs), alpha_(alpha) {} -tensorflow::Status GemmThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { +Status GemmThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) { VLOG(2) << "Executing a GemmThunk"; se::DeviceMemoryBase lhs_data = @@ -284,10 +300,12 @@ tensorflow::Status GemmThunk::ExecuteOnStream( shape.dimensions(!is_row_major)); }; - const MatrixDescriptor lhs_descriptor = - make_descriptor(lhs_data, lhs_shape_, transpose_lhs_); - const MatrixDescriptor rhs_descriptor = - make_descriptor(rhs_data, rhs_shape_, transpose_rhs_); + DotDimensionNumbers dim_nums = GetDimensionNumbers(*hlo_instruction()); + + const MatrixDescriptor lhs_descriptor = make_descriptor( + lhs_data, lhs_shape_, dim_nums.lhs_contracting_dimensions(0) == 0); + const MatrixDescriptor rhs_descriptor = make_descriptor( + rhs_data, rhs_shape_, dim_nums.rhs_contracting_dimensions(0) == 1); // Dispatches to a regular cublas gemm, a gemm-with-algorithm, or attempts to // autotune this gemm to figure out the best algorithm. @@ -350,7 +368,7 @@ tensorflow::Status GemmThunk::ExecuteOnStream( if (!launch_ok) { return InternalError("Unable to launch cuBLAS gemm on stream %p", stream); } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h index a18f425bc38fd3..7a4830d64e7cae 100644 --- a/tensorflow/compiler/xla/service/gpu/gemm_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/gemm_thunk.h @@ -35,22 +35,20 @@ namespace gpu { class GemmThunk : public Thunk { public: // Constructs a thunk that computes "output = (lhs rhs) * alpha" using - // BLAS gemm. transpose_lhs and transpose_rhs indicate whether gemm should - // transpose the lhs and rhs operand. hlo_instruction is as in Thunk. alpha is - // a constant. + // BLAS gemm. hlo_instruction is as in Thunk. alpha is a constant. GemmThunk(const BufferAllocation::Slice& lhs_buffer, const BufferAllocation::Slice& rhs_buffer, const BufferAllocation::Slice& output_buffer, const Shape& lhs_shape, const Shape& rhs_shape, - const Shape& output_shape, bool transpose_lhs, bool transpose_rhs, - double alpha, const HloInstruction* hlo_instruction); + const Shape& output_shape, double alpha, + const HloInstruction* hlo_instruction); GemmThunk(const GemmThunk&) = delete; GemmThunk& operator=(const GemmThunk&) = delete; // Does the gemm operation for the thunk on "stream", which must be non-null. 
- tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; // Returns true if we'll perform autotuning if run on the given stream. If // so, we want the GPU to be quiescent during autotuning, so as not to @@ -69,8 +67,6 @@ class GemmThunk : public Thunk { const Shape rhs_shape_; const Shape output_shape_; - const bool transpose_lhs_; - const bool transpose_rhs_; const double alpha_; // Maps device names (StreamExecutor::DeviceDescription::name()) to autotune diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc index 9db85bc788bde4..c5ccdd4a7dcec0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc @@ -78,14 +78,13 @@ StatusOr GpuCopyInsertion::Run(HloModule* module) { for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); } - } else if (IsCustomCallToDnnConvolution(*hlo)) { - // The last two arguments to a CUDNN convolution are two HLO constants for - // cudnn algorithm and tensor_ops_enabled flag, which shouldn't be copied. - for (int64 i = 0; i < hlo->operand_count() - 2; ++i) { - TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); - } - } else if (ImplementedAsLibraryCall(*hlo)) { - // For all other library calls, materialize all the operands into memory. + } else if (ImplementedAsLibraryCall(*hlo) || + hlo->opcode() == HloOpcode::kCrossReplicaSum) { + // For all other library calls and cross-replica-sum, materialize all the + // operands into memory. (Cross-replica-sum gets its constant args + // materialized even if it's not implemented as a libcall to simplify the + // implementation. It's slower, but we can constant fold away constant + // args *anyway*, so we just need to make it work.) for (int64 i = 0; i < hlo->operand_count(); ++i) { TF_RETURN_IF_ERROR(copy_operand_if_constant(i)); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc index 97c10c9a5299f7..0196c4cf82457c 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc @@ -32,12 +32,15 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/tracing.h" #include "tensorflow/core/platform/types.h" namespace xla { namespace gpu { namespace { +using tensorflow::tracing::ScopedAnnotation; + // A helper class for profiling HLO in the course of GPU program execution. // All of the profiling is guarded internally, to avoid the caller needing to // have lots of conditionals sprinkled around. 
@@ -134,6 +137,7 @@ Status GpuExecutable::ExecuteThunks( CheckCompatibilityWithServiceExecutableRunOptions(run_options); se::Stream* main_stream = run_options->stream(); + se::StreamExecutor* executor = main_stream->parent(); bool do_profile = hlo_execution_profile != nullptr; if (do_profile) { @@ -145,21 +149,39 @@ Status GpuExecutable::ExecuteThunks( sub_streams.reserve(thunk_schedule_->StreamCount() - 1); while (sub_streams.size() + 1 < thunk_schedule_->StreamCount()) { sub_streams.emplace_back(); - TF_ASSIGN_OR_RETURN( - sub_streams.back(), - run_options->BorrowStream(main_stream->parent()->device_ordinal())); + TF_ASSIGN_OR_RETURN(sub_streams.back(), + run_options->BorrowStream(executor->device_ordinal())); } HloExecutionProfiler profiler(do_profile, hlo_execution_profile, main_stream, sub_streams, hlo_module_->entry_computation()); uint64 start_micros = tensorflow::Env::Default()->NowMicros(); - // The next event enqueued on stream N must not run until the thunk at - // last_blocking_thunk_for_stream[N] completes. - std::map last_blocking_thunk_for_stream; + // This top-level trace serves two purposes: + // 1) It marks the scope of the whole XLA module. + // 2) It tells us whether tracing is enabled. We use this to avoid the + // expensive HloInstruction::ToString() calls inside the loop below if + // tracing is disabled. + ScopedAnnotation top_level_annotation(hlo_module_->name(), "XLA GPU module"); + std::map> thunk_to_finish_event; for (Thunk* thunk : thunk_schedule_->TotalOrder()) { - TF_RETURN_IF_ERROR(thunk->Initialize(*this)); + // Annotate execution of this op if tracing was enabled when we started + // running this module. If tracing is enabled *while* we're running the + // module, we won't get any data, but that's probably an OK trade-off. + // + // TODO(jlebar): Should we cache the results of HloInstruction::ToString(), + // since we expect it to be an expensive call? + tensorflow::gtl::optional op_annotation; + if (top_level_annotation.IsEnabled()) { + op_annotation.emplace( + thunk->hlo_instruction() != nullptr + ? thunk->hlo_instruction()->ToString(HloPrintOptions::Canonical()) + : "", + "XLA op"); + } + + TF_RETURN_IF_ERROR(thunk->Initialize(*this, executor)); int32 stream_no = thunk_schedule_->StreamNumberForHlo(*thunk->hlo_instruction()); se::Stream* stream = @@ -169,18 +191,10 @@ Status GpuExecutable::ExecuteThunks( stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, dependency).get()); } - if (last_blocking_thunk_for_stream.count(stream_no)) { - stream->ThenWaitFor(FindOrDie(thunk_to_finish_event, - last_blocking_thunk_for_stream[stream_no]) - .get()); - last_blocking_thunk_for_stream.erase(stream_no); - } - // If this thunk requests it, wait for all currently-executing thunks to // finish. This is useful e.g. if the thunk is about to perform autotuning. 
if (thunk->ShouldHaltAllActivityBeforeRunning(stream)) { TF_RETURN_IF_ERROR(main_stream->BlockHostUntilDone()); - last_blocking_thunk_for_stream.clear(); } profiler.StartOperation(); @@ -188,22 +202,11 @@ Status GpuExecutable::ExecuteThunks( << thunk->hlo_instruction()->ToString() << " on stream " << stream_no; TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream)); - if (thunk_schedule_->Depended(thunk) || thunk->ShouldBlockFutureThunks()) { + if (thunk_schedule_->Depended(thunk)) { auto finish_event = MakeUnique(main_stream->parent()); finish_event->Init(); stream->ThenRecordEvent(finish_event.get()); thunk_to_finish_event[thunk] = std::move(finish_event); - - if (thunk->ShouldBlockFutureThunks()) { - // Set last_blocking_thunk_for_stream on all streams other than this one - // so that all other streams will wait for this thunk to complete before - // executing any events that occur later in the total order. - for (int32 i = 0; i < sub_streams.size() + 1; ++i) { - if (i != stream_no) { - last_blocking_thunk_for_stream[i] = thunk; - } - } - } } profiler.FinishOperation(thunk->hlo_instruction()); } @@ -276,8 +279,8 @@ StatusOr GpuExecutable::ExecuteOnStream( se::StreamExecutor* executor = run_options->stream()->parent(); TF_ASSIGN_OR_RETURN( auto buffer_allocations, - buffer_allocations_builder.Build(*assignment_, executor->device_ordinal(), - memory_allocator)); + buffer_allocations_builder.Build( + assignment_.get(), executor->device_ordinal(), memory_allocator)); bool block_host_until_done = !memory_allocator->AllowsAsynchronousDeallocation(); @@ -319,8 +322,7 @@ StatusOr GpuExecutable::ExecuteOnStream( buffers_in_result.insert(src_base); return Status::OK(); })); - TF_RETURN_IF_ERROR( - buffer_allocations->TearDown(buffers_in_result, *assignment_)); + TF_RETURN_IF_ERROR(buffer_allocations->TearDown(buffers_in_result)); return std::move(shaped_buffer); } diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc index 89f1e625884568..8bf62dde8b9948 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc @@ -18,31 +18,72 @@ limitations under the License. #include #include "tensorflow/compiler/xla/layout_util.h" +#include "tensorflow/compiler/xla/service/gpu/gpu_options.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" namespace xla { namespace gpu { -// cuDNN convolutions are called with specific layouts on the input, output, -// and filter: -// -// input: DataLayout::kBatchDepthYX -// output: DataLayout::kBatchDepthYX -// filter: FilterLayout::kOutputInputYX -// -// The order dimensions in the constant name is major-to-minor (eg, the -// most-major dimension of the input is batch, most-minor is X). The -// specific dimension numbers these named dimensions correspond to is -// determined by the ConvolutionDimensionNumbers argument. Y is spatial -// dimension 0, and X is spatial dimension 1. -// -// TODO(b/29399649): Be more flexible about handling layouts of cuDNN calls. 
-static Status AddBackendConstraintsToDnnConvCustomCall( +using stream_executor::dnn::DataLayout; +using stream_executor::dnn::FilterLayout; + +static bool IsVoltaOrLater(const se::StreamExecutor& stream_executor) { + int major, minor; + CHECK(stream_executor.GetDeviceDescription().cuda_compute_capability(&major, + &minor)); + return major >= 7; +} + +// Returns (input, filter, output) layouts. +static std::tuple +HeuristicLayoutAssignment(const HloInstruction* instr, + stream_executor::StreamExecutor* stream_executor) { + // DataLayout and FilterLayout uses weird enum names. Translations: + // N <=> Batch or Output + // C <=> Depth or Input + // H <=> Y + // W <=> X + // + // Therefore kOutputInputYX means NHWC; kBatchDepthYX means NCHW. + + // As of today, our empirical evidence is that cudnn 7.0 is faster on V100 x + // fp16 with the mostly-NHWC layout. The heuristic may change as cudnn version + // changes, as well as the hardware updates. + if (!(instr->operand(0)->shape().element_type() == xla::PrimitiveType::F16 && + IsVoltaOrLater(*stream_executor))) { + return std::make_tuple(DataLayout::kBatchDepthYX, + FilterLayout::kOutputInputYX, + DataLayout::kBatchDepthYX); + } + VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString(); + // For BackwardInput that has stride, full NHWC layouts run significantly + // slower than (NHWC, NCHW, NCHW) or (NHWC, NCHW, NHWC). + // + // TODO(timshen): more closely compare (NHWC, NCHW, NCHW) and (NHWC, NCHW, + // NHWC). + if (instr->custom_call_target() == kCudnnConvBackwardInputCallTarget && + window_util::HasStride(instr->window())) { + return std::make_tuple(DataLayout::kBatchYXDepth, + FilterLayout::kOutputInputYX, + DataLayout::kBatchDepthYX); + } + return std::make_tuple(DataLayout::kBatchYXDepth, + FilterLayout::kOutputYXInput, + DataLayout::kBatchYXDepth); +} + +// Adds layout constraints on the cudnn custom-call instruction. The layout +// constraints are represented in terms of minor_to_major fields of both +// operands and the output shape. Depending on the underlying algorithm, one of +// { NCHW, NHWC } ^ 3 = 8 different layout combinations may be chosen. +Status GpuLayoutAssignment::AddBackendConstraintsToDnnConvCustomCall( HloInstruction* instr, LayoutConstraints* constraints) { CHECK(IsCustomCallToDnnConvolution(*instr)) << instr->ToString(); Shape input_shape; @@ -66,39 +107,25 @@ static Status AddBackendConstraintsToDnnConvCustomCall( << instr->custom_call_target(); } - // Construct minor-to-major dimension orders for operands and result. - // cuDNN's convolution APIs support the BDYX layout for activations/output - // and the OIYX layout for weights. - // TODO(b/29399649): Be more flexible about handling layouts of cuDNN - // calls after we switch to cuDNN v5. 
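// HeuristicLayoutAssignment above chooses between NCHW-style and NHWC-style
// cuDNN layouts from three inputs: the element type, the compute capability,
// and whether the call is a strided backward-input convolution. A condensed
// standalone version of that decision table; the enum names mirror the
// StreamExecutor ones, but the boolean parameters are simplified stand-ins
// for the real shape, window, and device queries.
#include <tuple>

namespace layout_sketch {

enum class DataLayout { kBatchDepthYX /*NCHW*/, kBatchYXDepth /*NHWC*/ };
enum class FilterLayout { kOutputInputYX /*OIYX*/, kOutputYXInput /*OYXI*/ };

// Returns (input, filter, output) layouts for a cuDNN convolution.
std::tuple<DataLayout, FilterLayout, DataLayout> ChooseConvLayouts(
    bool is_f16, bool is_volta_or_later, bool is_strided_backward_input) {
  // Default: plain NCHW everywhere.
  if (!(is_f16 && is_volta_or_later)) {
    return std::make_tuple(DataLayout::kBatchDepthYX,
                           FilterLayout::kOutputInputYX,
                           DataLayout::kBatchDepthYX);
  }
  // Strided backward-input convolutions avoid the fully-NHWC combination.
  if (is_strided_backward_input) {
    return std::make_tuple(DataLayout::kBatchYXDepth,
                           FilterLayout::kOutputInputYX,
                           DataLayout::kBatchDepthYX);
  }
  // Otherwise prefer the mostly-NHWC combination for fp16 on Volta.
  return std::make_tuple(DataLayout::kBatchYXDepth,
                         FilterLayout::kOutputYXInput,
                         DataLayout::kBatchYXDepth);
}

}  // namespace layout_sketch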
- const ConvolutionDimensionNumbers& dimension_numbers = - instr->convolution_dimension_numbers(); - std::vector input_layout; - for (int i = dimension_numbers.input_spatial_dimensions_size() - 1; i >= 0; - --i) { - input_layout.push_back(dimension_numbers.input_spatial_dimensions(i)); - } - input_layout.push_back(dimension_numbers.input_feature_dimension()); - input_layout.push_back(dimension_numbers.input_batch_dimension()); - *input_shape.mutable_layout() = LayoutUtil::MakeLayout(input_layout); - - std::vector filter_layout; - for (int i = dimension_numbers.kernel_spatial_dimensions_size() - 1; i >= 0; - --i) { - filter_layout.push_back(dimension_numbers.kernel_spatial_dimensions(i)); - } - filter_layout.push_back(dimension_numbers.kernel_input_feature_dimension()); - filter_layout.push_back(dimension_numbers.kernel_output_feature_dimension()); - *filter_shape.mutable_layout() = LayoutUtil::MakeLayout(filter_layout); - - std::vector output_layout; - for (int i = dimension_numbers.output_spatial_dimensions_size() - 1; i >= 0; - --i) { - output_layout.push_back(dimension_numbers.output_spatial_dimensions(i)); + { + DataLayout input; + FilterLayout filter; + DataLayout output; + if (ConvUseLayoutHeuristic(instr->GetModule()->config())) { + std::tie(input, filter, output) = + HeuristicLayoutAssignment(instr, stream_executor_); + } else { + input = DataLayout::kBatchDepthYX; + filter = FilterLayout::kOutputInputYX; + output = DataLayout::kBatchDepthYX; + } + + TF_ASSIGN_OR_RETURN( + std::tie(*input_shape.mutable_layout(), *filter_shape.mutable_layout(), + *output_shape.mutable_layout()), + StreamExecutorConvLayoutsToXlaLayouts( + instr->convolution_dimension_numbers(), input, filter, output)); } - output_layout.push_back(dimension_numbers.output_feature_dimension()); - output_layout.push_back(dimension_numbers.output_batch_dimension()); - *output_shape.mutable_layout() = LayoutUtil::MakeLayout(output_layout); // The custom call returns a tuple of (actual_result, scratch_buffer); // call_result_buf is the logical buffer for actual_result, the thing that @@ -132,7 +159,13 @@ static Status AddBackendConstraintsToDnnConvCustomCall( Status GpuLayoutAssignment::AddBackendConstraints( LayoutConstraints* constraints) { - for (auto* instruction : constraints->computation()->instructions()) { + // Add convolution constraints in reverse postorder that the earliest + // convolution layout propagates first. This reduces the likelihood of fusion + // nodes with copies. + auto post_order = constraints->computation()->MakeInstructionPostOrder(); + for (auto iterator = post_order.rbegin(); iterator != post_order.rend(); + ++iterator) { + HloInstruction* instruction = *iterator; if (IsCustomCallToDnnConvolution(*instruction)) { TF_RETURN_IF_ERROR( AddBackendConstraintsToDnnConvCustomCall(instruction, constraints)); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h index 86a3a7111fd794..ce24af1cf88569 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.h @@ -19,6 +19,7 @@ limitations under the License. 
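// The layouts being constrained here are XLA minor-to-major permutations.
// For reference, a standalone sketch of how a kBatchDepthYX (NCHW) activation
// layout maps to a minor-to-major order, equivalent to what the removed loops
// above built by hand; ConvDimNums is a simplified stand-in for
// ConvolutionDimensionNumbers.
#include <cstdint>
#include <vector>

namespace minor_to_major_sketch {

struct ConvDimNums {
  std::int64_t input_batch_dimension = 0;
  std::int64_t input_feature_dimension = 1;
  std::vector<std::int64_t> input_spatial_dimensions = {2, 3};  // {Y, X}
};

// X is most-minor, then Y, then feature (C), then batch (N) as most-major.
std::vector<std::int64_t> NchwMinorToMajor(const ConvDimNums& dnums) {
  std::vector<std::int64_t> layout;
  for (auto it = dnums.input_spatial_dimensions.rbegin();
       it != dnums.input_spatial_dimensions.rend(); ++it) {
    layout.push_back(*it);
  }
  layout.push_back(dnums.input_feature_dimension);
  layout.push_back(dnums.input_batch_dimension);
  return layout;  // {3, 2, 1, 0} for the defaults above.
}

}  // namespace minor_to_major_sketch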
#include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/layout_assignment.h" #include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { namespace gpu { @@ -27,8 +28,10 @@ namespace gpu { // layout constraints for operands and results of library calls. class GpuLayoutAssignment : public LayoutAssignment { public: - explicit GpuLayoutAssignment(ComputationLayout* entry_computation_layout) - : LayoutAssignment(entry_computation_layout) {} + explicit GpuLayoutAssignment(ComputationLayout* entry_computation_layout, + se::StreamExecutor* stream_executor) + : LayoutAssignment(entry_computation_layout), + stream_executor_(stream_executor) {} ~GpuLayoutAssignment() override {} protected: @@ -41,6 +44,12 @@ class GpuLayoutAssignment : public LayoutAssignment { LayoutConstraints* constraints) override; bool CustomCallRequiresMajorFirstLayout( const HloInstruction* instruction) override; + + private: + Status AddBackendConstraintsToDnnConvCustomCall( + HloInstruction* instr, LayoutConstraints* constraints); + + se::StreamExecutor* stream_executor_; }; } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc index 4c45d2e94aebce..e48165c1426ea0 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc @@ -69,7 +69,8 @@ TEST_F(LayoutAssignmentTest, Elementwise) { *computation_layout.mutable_result_layout() = ShapeLayout(result_shape_with_layout); - GpuLayoutAssignment layout_assignment(&computation_layout); + GpuLayoutAssignment layout_assignment( + &computation_layout, backend().default_stream_executor()); EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie()); for (const HloInstruction* operand : add->operands()) { @@ -156,7 +157,8 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) { *computation_layout.mutable_result_layout() = ShapeLayout(result_shape); } - GpuLayoutAssignment layout_assignment(&computation_layout); + GpuLayoutAssignment layout_assignment( + &computation_layout, backend().default_stream_executor()); EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie()); // The first operand to batchnorm should have the same layout as the @@ -225,7 +227,8 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) { {result_shape, offset_scale_shape, offset_scale_shape})); } - GpuLayoutAssignment layout_assignment(&computation_layout); + GpuLayoutAssignment layout_assignment( + &computation_layout, backend().default_stream_executor()); EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie()); // The first operand to batchnorm should have the same layout as the @@ -305,7 +308,8 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) { {result_shape, scale_shape, scale_shape})); } - GpuLayoutAssignment layout_assignment(&computation_layout); + GpuLayoutAssignment layout_assignment( + &computation_layout, backend().default_stream_executor()); EXPECT_TRUE(layout_assignment.Run(module.get()).ValueOrDie()); // The first and fourth operands to the batchnorm call should have the diff --git a/tensorflow/compiler/xla/service/gpu/gpu_options.cc b/tensorflow/compiler/xla/service/gpu/gpu_options.cc new file mode 100644 index 00000000000000..35b4b4e20b6337 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_options.cc @@ -0,0 +1,28 @@ +/* Copyright 2018 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/gpu_options.h" +#include "tensorflow/core/lib/gtl/map_util.h" + +namespace xla { +namespace gpu { + +bool ConvUseLayoutHeuristic(const HloModuleConfig& config) { + return !config.debug_options().xla_backend_extra_options().count( + "xla_gpu_experimental_conv_disable_layout_heuristic"); +} + +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/gpu_options.h b/tensorflow/compiler/xla/service/gpu/gpu_options.h new file mode 100644 index 00000000000000..498d4a94955cb2 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/gpu_options.h @@ -0,0 +1,33 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_ + +#include "tensorflow/compiler/xla/service/hlo_module_config.h" + +// Helper functions for querying options that are specific to the GPU backend. + +namespace xla { +namespace gpu { + +// Returns true if we should use heuristics to assign convolution layouts, as +// opposed to always assigning NCHW. 
+bool ConvUseLayoutHeuristic(const HloModuleConfig& config); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_GPU_OPTIONS_H_ diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc index b7e3d8cdf34d6e..51897b1c96a478 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.cc @@ -54,8 +54,8 @@ GpuTransferManager::GpuTransferManager(se::Platform::Id id) #endif .getPointerSize(0 /* default address space */)) {} -Status GpuTransferManager::TransferLiteralToInfeed(se::StreamExecutor* executor, - const Literal& literal) { +Status GpuTransferManager::TransferLiteralToInfeed( + se::StreamExecutor* executor, const LiteralSlice& literal) { const Shape& shape = literal.shape(); VLOG(2) << "Transferring literal to infeed with shape: " << ShapeUtil::HumanString(shape); diff --git a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h index 1fff17f6071bc1..316fc4bf8f9503 100644 --- a/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h +++ b/tensorflow/compiler/xla/service/gpu/gpu_transfer_manager.h @@ -37,7 +37,7 @@ class GpuTransferManager : public GenericTransferManager { ~GpuTransferManager() override {} Status TransferLiteralToInfeed(se::StreamExecutor* executor, - const Literal& literal) override; + const LiteralSlice& literal) override; Status TransferBufferToInfeed(se::StreamExecutor* executor, int64 size, const void* source) override; diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc index 42c1539e86c2ab..f766f968826d96 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule.cc @@ -20,6 +20,7 @@ limitations under the License. 
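// ConvUseLayoutHeuristic above treats the heuristic as on by default and only
// turns it off when an escape-hatch key is present in the backend extra
// options. A standalone sketch of that opt-out style of flag; a plain
// std::map stands in for the proto map field.
#include <map>
#include <string>

namespace options_sketch {

using ExtraOptions = std::map<std::string, std::string>;

bool ConvUseLayoutHeuristic(const ExtraOptions& options) {
  // Present-or-absent is all that matters; the value is ignored.
  return options.count(
             "xla_gpu_experimental_conv_disable_layout_heuristic") == 0;
}

// Usage: an empty map keeps the heuristic enabled; inserting the key with any
// value disables it.

}  // namespace options_sketch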
#include "tensorflow/compiler/xla/service/gpu/hlo_schedule.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/service/hlo_scheduling.h" #include "tensorflow/compiler/xla/types.h" @@ -199,7 +200,7 @@ StatusOr> HloSchedule::Build( TF_ASSIGN_OR_RETURN( schedule->thunk_launch_order_, CreateMemoryMinimizingSequence( - *entry_computation, [pointer_size](const LogicalBuffer& buffer) { + *entry_computation, [pointer_size](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size); })); } else { diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc index ece9fa04dce3fd..e230d538cc2df8 100644 --- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc +++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc @@ -42,6 +42,15 @@ class HloScheduleTest : public HloTestBase { .ConsumeValueOrDie(); } + std::unique_ptr CreateNewModule() { + HloModuleConfig config; + auto debug_options = GetDebugOptionsForTest(); + debug_options.set_xla_gpu_disable_multi_streaming(false); + config.set_debug_options(debug_options); + return MakeUnique("test_module", VersionedComputationHandle(), + config); + } + HloVec RemoveHlo(const HloVec& input, const std::unordered_set& remove) { HloVec result(input); @@ -65,9 +74,9 @@ TEST_F(HloScheduleTest, SequentialMatMul) { HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/2, f32_2x2_, /*name=*/"z")); HloInstruction* dot1 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y)); + HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); HloInstruction* dot2 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, dot1, z)); + HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, z)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build(dot2)); @@ -193,11 +202,11 @@ TEST_F(HloScheduleTest, ConcurrentMatMul) { HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); HloInstruction* dot1 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y)); + HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); HloInstruction* dot2 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, y, x)); + HloInstruction::CreateCanonicalDot(f32_2x2_, y, x)); HloInstruction* add = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2)); + HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, dot2)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build(add)); @@ -259,24 +268,24 @@ TEST_F(HloScheduleTest, LatticeMatMul) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); } - HloInstruction* d00 = builder.AddInstruction(HloInstruction::CreateBinary( - f32_2x2_, HloOpcode::kDot, params[2], params[3])); + HloInstruction* d00 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3])); HloInstruction* d10 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[1], d00)); + HloInstruction::CreateCanonicalDot(f32_2x2_, params[1], d00)); HloInstruction* d11 = builder.AddInstruction( - 
HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d00, params[4])); + HloInstruction::CreateCanonicalDot(f32_2x2_, d00, params[4])); HloInstruction* d20 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[0], d10)); + HloInstruction::CreateCanonicalDot(f32_2x2_, params[0], d10)); HloInstruction* d21 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d10, d11)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d10, d11)); HloInstruction* d22 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d11, params[5])); + HloInstruction::CreateCanonicalDot(f32_2x2_, d11, params[5])); HloInstruction* d30 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d20, d21)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d20, d21)); HloInstruction* d31 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d21, d22)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d21, d22)); HloInstruction* d40 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d30, d31)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d30, d31)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build(d40)); diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc index 3ddc1c0789d746..ae310beefad0c8 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.cc +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.cc @@ -49,13 +49,25 @@ void InfeedManager::EnqueueBuffers(const std::vector& buffers) { } InfeedBuffer* InfeedManager::BlockingDequeueBuffer() { - tensorflow::mutex_lock l(mu_); - while (enqueued_buffer_.empty()) { - cv_.wait(l); + bool became_empty = false; + InfeedBuffer* current_buffer; + { + tensorflow::mutex_lock l(mu_); + while (enqueued_buffer_.empty()) { + cv_.wait(l); + } + current_buffer = enqueued_buffer_.front(); + enqueued_buffer_.pop_front(); + dequeued_buffer_.insert(current_buffer); + if (enqueued_buffer_.empty()) { + became_empty = true; + } + } + if (became_empty) { + for (const auto& callback : on_empty_callbacks_) { + callback(); + } } - InfeedBuffer* current_buffer = enqueued_buffer_.front(); - enqueued_buffer_.pop_front(); - dequeued_buffer_.insert(current_buffer); return current_buffer; } @@ -88,6 +100,10 @@ se::Stream* InfeedManager::GetStream(se::StreamExecutor* executor) { return host_to_device_stream_.get(); } +void InfeedManager::RegisterOnEmptyCallback(std::function callback) { + on_empty_callbacks_.push_back(std::move(callback)); +} + InfeedManager* GetOrCreateInfeedManager() { static InfeedManager* manager = new InfeedManager; return manager; diff --git a/tensorflow/compiler/xla/service/gpu/infeed_manager.h b/tensorflow/compiler/xla/service/gpu/infeed_manager.h index d5f2216d460a45..a3fc15cfe36a49 100644 --- a/tensorflow/compiler/xla/service/gpu/infeed_manager.h +++ b/tensorflow/compiler/xla/service/gpu/infeed_manager.h @@ -21,6 +21,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_INFEED_MANAGER_H_ #include +#include #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/flatset.h" @@ -100,6 +101,10 @@ class InfeedManager { // returns null. se::Stream* GetStream(se::StreamExecutor* executor); + // Registers a callback that will be called when 'enqueued_buffer_' becomes + // empty. 
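// BlockingDequeueBuffer above waits on a condition variable, pops the buffer
// while holding the lock, and defers the on-empty callbacks until the lock
// has been released. A self-contained sketch of the same shape using the
// standard library; buffers are opaque pointers here rather than
// InfeedBuffer*.
#include <condition_variable>
#include <deque>
#include <functional>
#include <mutex>
#include <utility>
#include <vector>

namespace infeed_sketch {

class BufferQueue {
 public:
  void Enqueue(void* buffer) {
    {
      std::lock_guard<std::mutex> lock(mu_);
      queue_.push_back(buffer);
    }
    cv_.notify_one();
  }

  void* BlockingDequeue() {
    bool became_empty = false;
    void* buffer = nullptr;
    {
      std::unique_lock<std::mutex> lock(mu_);
      cv_.wait(lock, [this] { return !queue_.empty(); });
      buffer = queue_.front();
      queue_.pop_front();
      became_empty = queue_.empty();
    }
    // Run callbacks outside the lock so a callback may safely re-enqueue.
    if (became_empty) {
      for (const auto& callback : on_empty_callbacks_) callback();
    }
    return buffer;
  }

  void RegisterOnEmptyCallback(std::function<void()> callback) {
    on_empty_callbacks_.push_back(std::move(callback));
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<void*> queue_;
  std::vector<std::function<void()>> on_empty_callbacks_;
};

}  // namespace infeed_sketch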
+ void RegisterOnEmptyCallback(std::function callback); + private: // TODO(b/30467474): Revisit if this mutex becomes a point of // contention. @@ -122,6 +127,10 @@ class InfeedManager { // Executor that the host_to_device_stream belongs to. Not owned. se::StreamExecutor* host_to_device_executor_; + + // List of callbacks which will be called when 'enqueued_buffer_' becomes + // empty. + std::vector> on_empty_callbacks_; }; // Singleton creator-or-accessor: Returns the GPU infeed manager. diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc index 85ecbe8fdb3470..36a1b82a26d84f 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc @@ -17,7 +17,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { namespace gpu { @@ -46,41 +48,100 @@ bool IsFusile(const HloInstruction& hlo) { hlo.opcode() == HloOpcode::kTranspose; } +bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) { + if (constant->opcode() != HloOpcode::kConstant || + !ShapeUtil::IsScalar(constant->shape())) { + return false; + } + auto type = constant->shape().element_type(); + return type == F16 || type == F32 || type == F64; +} + } // namespace +/*static*/ bool GpuInstructionFusion::IsExpensive( + const HloInstruction& instruction) { + switch (instruction.opcode()) { + // We say that floating-point division is cheap on the GPU. + case HloOpcode::kDivide: + return !ShapeUtil::ElementIsFloating(instruction.shape()) && + InstructionFusion::IsExpensive(instruction); + + default: + return InstructionFusion::IsExpensive(instruction); + } +} + bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { HloInstruction* producer = consumer->mutable_operand(operand_index); // Check if we can use output fusion for (A @ B) * alpha - if (producer->opcode() == HloOpcode::kDot) { - if (consumer->opcode() == HloOpcode::kMultiply) { - CHECK_EQ(consumer->operand_count(), 2); - int64 other_operand_index = 1 - operand_index; - const HloInstruction* alpha = consumer->operand(other_operand_index); - if (alpha->opcode() == HloOpcode::kConstant && - ShapeUtil::IsScalar(alpha->shape())) { + if (consumer->operand_count() == 2 && + (producer->opcode() == HloOpcode::kDot || + (producer->opcode() == HloOpcode::kFusion && + producer->fused_expression_root()->opcode() == HloOpcode::kDot))) { + int64 other_operand_index = 1 - operand_index; + const HloInstruction* alpha = consumer->operand(other_operand_index); + HloInstruction* op1 = nullptr; + HloInstruction* op2 = nullptr; + if (consumer->opcode() == HloOpcode::kFusion && + consumer->fusion_kind() == HloInstruction::FusionKind::kLoop && + Match(consumer->fused_expression_root(), + match::Op() + .WithOpcode(HloOpcode::kMultiply) + .WithOperand(0, match::Op(&op1)) + .WithOperand(1, match::Op(&op2)))) { + CHECK(op1 != nullptr && op2 != nullptr); + // If 'consumer' is a fusion node, it should consist of a broadcast of a + // scalar constant fused into a multiply, but nothing more. So one operand + // should be a parameter, and the other should be a broadcast. 
+ if (op1->opcode() != HloOpcode::kParameter) { + std::swap(op1, op2); + } + if (op1->opcode() != HloOpcode::kParameter || + op2->opcode() != HloOpcode::kBroadcast) { + return false; + } + if (IsIEEEFloatingPointScalarConstant(alpha)) { + return true; + } + } else if (consumer->opcode() == HloOpcode::kMultiply) { + // Fuse if 'alpha' is a broadcast of a scalar constant. + if (alpha->opcode() == HloOpcode::kBroadcast && + alpha->dimensions().empty() && + IsIEEEFloatingPointScalarConstant(alpha->operand(0))) { return true; } } } - // Only allow to fuse transpose into an output fusion. + // Only allow fusing transpose or broadcast into an output fusion that is + // implemented as a Gemm call. if (consumer->opcode() == HloOpcode::kFusion && - consumer->fusion_kind() == HloInstruction::FusionKind::kOutput) { - if (producer->opcode() != HloOpcode::kTranspose) { - return false; - } - // Check that the transpose is the operand of a dot. + consumer->fusion_kind() == HloInstruction::FusionKind::kOutput && + ImplementedAsGemm(*consumer)) { auto producer_operand_index = consumer->operand_index(producer); auto fused_parameter = consumer->fused_parameter(producer_operand_index); const std::vector& fused_parameter_users = fused_parameter->users(); - return (fused_parameter_users.size() == 1 && - fused_parameter_users[0]->opcode() == HloOpcode::kDot); + if (fused_parameter_users.size() != 1) { + return false; + } + if (producer->opcode() == HloOpcode::kTranspose) { + // Check that the transpose is an operand of a dot. + return fused_parameter_users[0]->opcode() == HloOpcode::kDot; + } + if (producer->opcode() == HloOpcode::kBroadcast) { + // Check that the broadcast is a broadcast of a scalar constant into a + // multiply. + return producer->dimensions().empty() && + IsIEEEFloatingPointScalarConstant(producer->operand(0)) && + fused_parameter_users[0]->opcode() == HloOpcode::kMultiply; + } } - // Output fusion is not currently supported on GPUs. + // Other output fusions are not currently supported on GPUs. if (producer->opcode() == HloOpcode::kFusion) { return false; } @@ -116,12 +177,34 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer, InstructionFusion::ShouldFuse(consumer, operand_index); } +bool GpuInstructionFusion::ShouldFuseIntoMultiOutput(HloInstruction* consumer, + int64 operand_index) { + const HloInstruction* producer = consumer->operand(operand_index); + // The IR emitter has limited support for non-loop fusions with multi output + // at present. + // TODO(tjoerg): Relax this constraint to allow for arbitraty kinds of fusion. + if (consumer->opcode() == HloOpcode::kFusion && + consumer->fusion_kind() != HloInstruction::FusionKind::kLoop) { + return false; + } + // Multi-output fusion requires instructions with compatible shapes. + if (!ShapeUtil::Compatible(producer->shape(), consumer->shape())) { + return false; + } + // TODO(tjoerg): Stop calling `ShouldFuse` to relax the criteria for + // multi-output fusion. In particular, do not check whether an instruction is + // expensive to duplicate, since this doesn't matter here. 
+ return GpuInstructionFusion::ShouldFuse(consumer, operand_index); +} + HloInstruction::FusionKind GpuInstructionFusion::ChooseKind( const HloInstruction* producer, const HloInstruction* consumer) { if (IsReductionToVector(*consumer)) { return HloInstruction::FusionKind::kInput; } - if (producer->opcode() == HloOpcode::kDot) { + if (producer->opcode() == HloOpcode::kDot || + (producer->opcode() == HloOpcode::kFusion && + producer->fused_expression_root()->opcode() == HloOpcode::kDot)) { return HloInstruction::FusionKind::kOutput; } if (HloOpcode::kFusion == consumer->opcode()) { diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h index bb2990e6dfc9de..f629d9ff2c7165 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.h @@ -27,8 +27,13 @@ class GpuInstructionFusion : public InstructionFusion { explicit GpuInstructionFusion(bool may_duplicate) : InstructionFusion(GpuInstructionFusion::IsExpensive, may_duplicate) {} + static bool IsExpensive(const HloInstruction& instruction); + bool ShouldFuse(HloInstruction* consumer, int64 operand_index) override; + bool ShouldFuseIntoMultiOutput(HloInstruction* consumer, + int64 operand_index) override; + HloInstruction::FusionKind ChooseKind( const HloInstruction* producer, const HloInstruction* consumer) override; }; diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc index 4b231c449f8f10..426b1d235c3135 100644 --- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc @@ -15,9 +15,12 @@ limitations under the License. 
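// ShouldFuseIntoMultiOutput above gates multi-output fusion on two
// conditions: the consumer must be a loop fusion (or not yet a fusion), and
// the producer and consumer shapes must be compatible so they share one index
// space. A standalone sketch of those checks; Shape and Instr are simplified
// stand-ins, and plain shape equality replaces ShapeUtil::Compatible.
#include <cstdint>
#include <vector>

namespace multi_output_sketch {

struct Shape {
  std::vector<std::int64_t> dimensions;
  bool operator==(const Shape& other) const {
    return dimensions == other.dimensions;
  }
};

enum class FusionKind { kLoop, kInput, kOutput };

struct Instr {
  bool is_fusion = false;
  FusionKind fusion_kind = FusionKind::kLoop;
  Shape shape;
};

bool ShouldFuseIntoMultiOutput(const Instr& producer, const Instr& consumer) {
  if (consumer.is_fusion && consumer.fusion_kind != FusionKind::kLoop) {
    return false;
  }
  return producer.shape == consumer.shape;
}

}  // namespace multi_output_sketch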
#include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h" +#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/util.h" namespace op = xla::testing::opcode_matchers; @@ -108,8 +111,8 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) { HloComputation::Builder builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(S32, {1, 1}), "0")); - auto dot1 = builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(S32, {1, 1}), HloOpcode::kDot, param0, param0)); + auto dot1 = builder.AddInstruction(HloInstruction::CreateCanonicalDot( + ShapeUtil::MakeShape(S32, {1, 1}), param0, param0)); auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape( ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1)); @@ -125,8 +128,8 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { HloComputation::Builder builder(TestName()); auto param0 = builder.AddInstruction(HloInstruction::CreateParameter( 0, ShapeUtil::MakeShape(S32, {1, 1}), "0")); - auto dot1 = builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(S32, {1, 1}), HloOpcode::kDot, param0, param0)); + auto dot1 = builder.AddInstruction(HloInstruction::CreateCanonicalDot( + ShapeUtil::MakeShape(S32, {1, 1}), param0, param0)); auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1})); @@ -140,7 +143,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) { // Tests that broadcasts fused into a fusion with a reduce root. 
TEST_F(InstructionFusionTest, BroadcastIntoReduce) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module add { @@ -169,7 +172,7 @@ TEST_F(InstructionFusionTest, BroadcastIntoReduce) { } TEST_F(InstructionFusionTest, BitcastIntoAdd) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY BroadcastIntoAdd { @@ -191,7 +194,7 @@ TEST_F(InstructionFusionTest, BitcastIntoAdd) { } TEST_F(InstructionFusionTest, AddIntoBitcast) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY BroadcastIntoAdd { @@ -213,7 +216,7 @@ TEST_F(InstructionFusionTest, AddIntoBitcast) { } TEST_F(InstructionFusionTest, DontFuseGTE) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY DontFuseGTE { p0 = (f32[10], f32[10]) parameter(0) @@ -229,15 +232,16 @@ TEST_F(InstructionFusionTest, DontFuseGTE) { } TEST_F(InstructionFusionTest, DotOutputFusion) { - auto module = tools::Parse(R"( + auto module = ParseHloString(R"( HloModule test_module ENTRY OutputFusion { - constant = f32[] constant(3) + alpha = f32[] constant(3) + broadcast = f32[4,4]{1,0} broadcast(alpha), dimensions={} p0 = f32[4,3]{1,0} parameter(0) p1 = f32[4,3]{1,0} parameter(1) transpose = f32[3,4]{1,0} transpose(p1), dimensions={1, 0} - dot = f32[4,4]{1,0} dot(p0, transpose) - ROOT mul = f32[4,4] multiply(constant, dot) + dot = f32[4,4]{1,0} dot(p0, transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} + ROOT mul = f32[4,4] multiply(dot, broadcast) })") .ValueOrDie(); @@ -247,10 +251,334 @@ TEST_F(InstructionFusionTest, DotOutputFusion) { HloInstruction* root = module->entry_computation()->root_instruction(); EXPECT_THAT(root, op::Fusion()); + EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kOutput); EXPECT_THAT( root->fused_expression_root(), - op::Multiply(op::Parameter(), - op::Dot(op::Parameter(), op::Transpose(op::Parameter())))); + op::Multiply(op::Dot(op::Parameter(), op::Transpose(op::Parameter())), + op::Broadcast(op::Parameter()))); +} + +// Compute sum(1/p0), where p0 has type f32, twice. Check that the division is +// duplicated and fused into both reduces. +TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) { + auto module = ParseHloString(R"( + HloModule test_module + Add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) + } + ENTRY TestComputation { + zero = f32[] constant(0) + one = f32[] constant(1) + p0 = f32[100] parameter(0) + recip = f32[100] divide(one, p0) + sum1 = f32[] reduce(recip, zero), dimensions={0}, to_apply=Add + sum2 = f32[] reduce(recip, zero), dimensions={0}, to_apply=Add + ROOT root = (f32[], f32[]) tuple(sum1, sum2) + })") + .ValueOrDie(); + + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Tuple(op::Fusion(), op::Fusion())) + << module->ToString(); +} + +// Compute sum(100/p0), where p0 has type s32, twice. Check that the division +// is *not* duplicated and fused into both reduces, because we say that integer +// division is not cheap. 
+TEST_F(InstructionFusionTest, IntegerDivIsNotCheap) { + auto module = ParseHloString(R"( + HloModule test_module + Add { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT add = s32[] add(lhs, rhs) + } + ENTRY TestComputation { + zero = s32[] constant(0) + one_hundred = s32[] constant(100) + p0 = s32[100] parameter(0) + recip = s32[100] divide(one_hundred, p0) + sum1 = s32[] reduce(recip, zero), dimensions={0}, to_apply=Add + sum2 = s32[] reduce(recip, zero), dimensions={0}, to_apply=Add + ROOT mul = (s32[], s32[]) tuple(sum1, sum2) + })") + .ValueOrDie(); + + EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); +} + +TEST_F(InstructionFusionTest, DotOutputFusionImpossible) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY NoOutputFusion { + alpha = f32[] constant(3) + broadcast = f32[4,4]{1,0} broadcast(alpha), dimensions={} + p0 = f32[4,3]{1,0} parameter(0) + p1 = f32[3,4]{1,0} parameter(1) + dot = f32[4,4]{1,0} dot(p0, p1), lhs_contracting_dims={1}, rhs_contracting_dims={0} + d = f32[4,4]{1,0} multiply(dot, dot) + ROOT mul = f32[4,4] multiply(d, broadcast) + })") + .ValueOrDie(); + + EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Fusion()); + EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kLoop); + EXPECT_THAT(root->fused_expression_root(), + op::Multiply(op::Multiply(op::Parameter(), op::Parameter()), + op::Broadcast(op::Parameter()))); +} + +// Counts the HLO ops with a given op code in the specified module. +static int Count(const HloModule& module, HloOpcode op) { + int count = 0; + for (const auto* computation : module.computations()) { + for (const auto* instruction : computation->instructions()) { + if (instruction->opcode() == op) { + ++count; + } + } + } + return count; +} + +// Returns an HLO instruction from the given computation with the op code. +static StatusOr FindHloInstruction( + const HloComputation& computation, HloOpcode op) { + for (const auto* instruction : computation.instructions()) { + if (instruction->opcode() == op) { + return instruction; + } + } + return NotFound( + "Computation '%s' does not contain an instruction with op code '%s'.", + computation.name().c_str(), HloOpcodeString(op).c_str()); +} + +TEST_F(InstructionFusionTest, MultiOutputFusion) { + // sub --> add --> tuple + // \---------------/ + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + p1 = f32[4,3]{1,0} parameter(1) + p2 = f32[4,3]{1,0} parameter(2) + sub = f32[4,3]{1,0} subtract(p0, p2) + add = f32[4,3]{1,0} add(sub, p1) + ROOT tuple = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(sub, add) + })") + .ValueOrDie(); + + ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + SCOPED_TRACE(module->ToString()); + + // Expect that there is one multi-output fusion and subtract has not been + // duplicated. 
+ EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1); + EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1); + TF_ASSERT_OK_AND_ASSIGN( + const HloInstruction* fusion, + FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion)); + EXPECT_THAT( + fusion->fused_expression_root(), + op::Tuple(op::Add(op::Subtract(), op::Parameter()), op::Subtract())); +} + +TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) { + // tanh --> add --> tuple + // \---------------/ + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + p1 = f32[4,3]{1,0} parameter(1) + tanh = f32[4,3]{1,0} tanh(p0) + add = f32[4,3]{1,0} add(tanh, p1) + ROOT tuple = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(tanh, add) + })") + .ValueOrDie(); + + // TODO(tjoerg): Allow multi-output fusion for expensive operations like tanh. + ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); +} + +TEST_F(InstructionFusionTest, MultiOutputFusion2) { + // sub --> add1 --\--------\ + // \----------> add2 --> tuple + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + p1 = f32[4,3]{1,0} parameter(1) + p2 = f32[4,3]{1,0} parameter(2) + sub = f32[4,3]{1,0} subtract(p0, p2) + add1 = f32[4,3]{1,0} add(sub, p1) + add2 = f32[4,3]{1,0} add(sub, add1) + ROOT tuple = (f32[4,3]{1,0}) tuple(add1, add2) + })") + .ValueOrDie(); + + ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + SCOPED_TRACE(module->ToString()); + + // Expect that there is one multi-output fusion and subtract has not been + // duplicated. + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1); + EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1); + TF_ASSERT_OK_AND_ASSIGN( + const HloInstruction* fusion, + FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion)); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Add(op::Subtract(), op::Add()), + op::Add(op::Subtract(), op::Parameter()))); +} + +TEST_F(InstructionFusionTest, MultiOutputFusion3) { + // sub --> add1 ----\--------\ + // \ --> add2 --> add3 --> tuple + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + p1 = f32[4,3]{1,0} parameter(1) + p2 = f32[4,3]{1,0} parameter(2) + p3 = f32[4,3]{1,0} parameter(3) + sub = f32[4,3]{1,0} subtract(p0, p2) + add1 = f32[4,3]{1,0} add(sub, p1) + add2 = f32[4,3]{1,0} add(p2, sub) + add3 = f32[4,3]{1,0} add(add1, add2) + ROOT tuple = (f32[4,3]{1,0}) tuple(add3, add2) + })") + .ValueOrDie(); + + ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + SCOPED_TRACE(module->ToString()); + + // Expect that there is one multi-output fusion and subtract has not been + // duplicated. 
+ EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1); + EXPECT_EQ(Count(*module, HloOpcode::kSubtract), 1); + TF_ASSERT_OK_AND_ASSIGN( + const HloInstruction* fusion, + FindHloInstruction(*module->entry_computation(), HloOpcode::kFusion)); + EXPECT_THAT(fusion->fused_expression_root(), + op::Tuple(op::Add(op::Add(), op::Add()), + op::Add(op::Parameter(), op::Subtract()))); +} + +TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) { + // sub --> mul ---\ + // \--> call --> add --> tuple + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + c = f32[] constant(42) + p0 = f32[4,3]{1,0} parameter(0) + p1 = f32[4,3]{1,0} parameter(1) + sub = f32[4,3]{1,0} subtract(p0, p1) + mul = f32[4,3]{1,0} multiply(sub, c) + call = f32[4,3]{1,0} custom-call(sub), custom_call_target="foo" + add = f32[4,3]{1,0} add(mul, call) + ROOT tuple = (f32[4,3]{1,0}) tuple(add) + })") + .ValueOrDie(); + + ASSERT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()); + // Visit instructions in post order to detect cycles. + // TODO(tjoerg): Add cycle detection to the HloVerifier. + class DummyVisitor : public DfsHloVisitorWithDefault { + public: + DummyVisitor() {} + Status DefaultAction(HloInstruction* /*hlo_instruction*/) override { + return Status::OK(); + } + } visitor; + for (const HloComputation* computation : module->MakeComputationPostOrder()) { + // Accept will return a FailedPrecondition when a cycle is detected. + EXPECT_TRUE(computation->root_instruction()->Accept(&visitor).ok()); + } +} + +TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) { + // sub[2,3] --> add[4,3] --> tuple([2,3], [4,3]) + // \-------------------------/ + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[2,3]{1,0} parameter(0) + p1 = f32[4,3]{1,0} parameter(1) + p2 = f32[2,3]{1,0} parameter(2) + sub = f32[2,3]{1,0} subtract(p0, p2) + add = f32[4,3]{1,0} add(sub, p1) + ROOT tuple = (f32[2,3]{1,0}, f32[4,3]{1,0}) tuple(sub, add) + })") + .ValueOrDie(); + + // Multi-output fusion requires shapes to be compatible. Since `sub` and `add` + // have incompatible shapes, expect that no multi-output fusion happens. + ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); +} + +TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) { + auto module = ParseHloString(R"( + HloModule test_module + + add_computation { + add_lhs = f32[] parameter(0) + add_rhs = f32[] parameter(1) + ROOT add_root = f32[] add(add_lhs, add_rhs) + } + + fused_computation { + p1 = f32[10] parameter(0) + zero = f32[] constant(0) + ROOT f2_root = f32[] reduce(p1, zero), dimensions={0}, + to_apply=add_computation + } + + ENTRY entry { + p0 = f32[10] parameter(0) + mul = f32[10] multiply(p0, p0) + fusion = f32[] fusion(mul), kind=kInput, calls=fused_computation + ROOT tuple = (f32[10], f32[]) tuple(fusion, mul) + })") + .ValueOrDie(); + + // Multi-output fusion is not supported for non-loop fusions at present. Since + // `fused_computation` is a input fusion, expect no multi-output fusion to + // happen. 
+ ASSERT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc index 5fc0780efb254f..3cc4ca60270546 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc @@ -59,6 +59,25 @@ bool AreValidGemmShapes(const Shape& lhs_shape, const Shape& rhs_shape, !ShapeUtil::HasZeroElements(lhs_shape) && !ShapeUtil::HasZeroElements(rhs_shape); } + +bool DotImplementedAsGemm(const HloInstruction& dot) { + CHECK_EQ(dot.opcode(), HloOpcode::kDot); + const Shape& lhs_shape = dot.operand(0)->shape(); + const Shape& rhs_shape = dot.operand(1)->shape(); + + // If gemm can accept the operand shapes, use it rather than a custom + // kernel. + if (AreValidGemmShapes(lhs_shape, rhs_shape, dot.shape())) { + // The size of the reduction dimension should match. The shape inference + // guarantees this invariant, so the check here is for programming + // errors. + const DotDimensionNumbers& dim_numbers = dot.dot_dimension_numbers(); + CHECK_EQ(lhs_shape.dimensions(dim_numbers.lhs_contracting_dimensions(0)), + rhs_shape.dimensions(dim_numbers.rhs_contracting_dimensions(0))); + return true; + } + return false; +} } // namespace bool ImplementedAsGemm(const HloInstruction& hlo) { @@ -69,24 +88,7 @@ bool ImplementedAsGemm(const HloInstruction& hlo) { // For certain types of Dot, we can call pre-canned BLAS gemm. if (hlo.opcode() == HloOpcode::kDot) { - const Shape& lhs_shape = hlo.operand(0)->shape(); - const Shape& rhs_shape = hlo.operand(1)->shape(); - - // If gemm can accept the operand shapes, use it rather than a custom - // kernel. - if (AreValidGemmShapes(lhs_shape, rhs_shape, hlo.shape())) { - // The size of the reduction dimension should match. The shape inference - // guarantees this invariant, so the check here is for programming - // errors. - CHECK_EQ(lhs_shape.dimensions(1), rhs_shape.dimensions(0)); - return true; - } - } - - if (hlo.opcode() == HloOpcode::kFusion && - hlo.fusion_kind() == HloInstruction::FusionKind::kTransposeDot && - hlo.fused_expression_root()->opcode() == HloOpcode::kDot) { - return true; + return DotImplementedAsGemm(hlo); } if (hlo.opcode() == HloOpcode::kFusion && @@ -98,7 +100,7 @@ bool ImplementedAsGemm(const HloInstruction& hlo) { dot = hlo.fused_expression_root()->operand(1); } if (dot->opcode() == HloOpcode::kDot) { - return ImplementedAsGemm(*dot); + return DotImplementedAsGemm(*dot); } } @@ -160,19 +162,8 @@ static HloInstruction* CreateCudnnConv( Shape call_shape = ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})}); - // Our CustomCall takes four arguments: The conv lhs and rhs, the cudnn - // algorithm to use, and a boolean indicating whether to use tensor cores. - // - // It's up to a later pass to choose the algorithm and decide whether to use - // tensor cores, so to indicate that we haven't yet made a choice, we speicfy - // -1 and false for those args. 
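// DotImplementedAsGemm above accepts a dot for the pre-canned BLAS path only
// when the operand shapes are gemm-friendly, and then double-checks that the
// contracting dimensions agree, e.g. [m,k] x [k,n] -> [m,n]. A standalone
// sketch of that check; the real AreValidGemmShapes also verifies element
// types and that neither operand is empty, which is omitted here.
#include <cstddef>
#include <cstdint>
#include <vector>

namespace gemm_sketch {

struct Shape {
  std::vector<std::int64_t> dimensions;
};

struct DotDimensionNumbers {
  std::size_t lhs_contracting_dimension = 1;
  std::size_t rhs_contracting_dimension = 0;
};

bool DotImplementableAsGemm(const Shape& lhs, const Shape& rhs,
                            const DotDimensionNumbers& dnums) {
  if (lhs.dimensions.size() != 2 || rhs.dimensions.size() != 2) {
    return false;
  }
  return lhs.dimensions[dnums.lhs_contracting_dimension] ==
         rhs.dimensions[dnums.rhs_contracting_dimension];
}

}  // namespace gemm_sketch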
- HloInstruction* negative_one = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(-1))); - HloInstruction* false_constant = computation->AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(false))); - HloInstruction* custom_call = - computation->AddInstruction(HloInstruction::CreateCustomCall( - call_shape, {lhs, rhs, negative_one, false_constant}, call_target)); + HloInstruction* custom_call = computation->AddInstruction( + HloInstruction::CreateCustomCall(call_shape, {lhs, rhs}, call_target)); custom_call->set_window(window); custom_call->set_convolution_dimension_numbers(dnums); return custom_call; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc index 6104dc1a83d356..bf7983f87d88a9 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc @@ -112,7 +112,10 @@ Status IrEmitter::HandleConstant(HloInstruction* constant) { << std::endl << " its type: " << llvm_ir::DumpToString(*global_for_const->getType()); - bindings_.BindHloToIrValue(*constant, global_for_const); + llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast( + global_for_const, + llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo()); + bindings_.BindHloToIrValue(*constant, shape_constant); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h index b0accc08d47925..e55dfc6dae844c 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h @@ -120,10 +120,11 @@ class IrEmitter : public DfsHloVisitorWithDefault { llvm::Value* GetBasePointer(const HloInstruction& inst) const { return bindings_.GetBasePointer(inst); } - // A convenient helper for calling BufferAssignment::GetUniqueTopLevelSlice. - BufferAllocation::Slice GetAllocationSlice(const HloInstruction& hlo) const { + // A convenient helper for calling BufferAssignment::GetUniqueSlice. + BufferAllocation::Slice GetAllocationSlice( + const HloInstruction& hlo, const ShapeIndex& index = {}) const { return ir_emitter_context_->buffer_assignment() - .GetUniqueTopLevelSlice(&hlo) + .GetUniqueSlice(&hlo, index) .ConsumeValueOrDie(); } diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc index 71aada080ae8df..bb47a4280541ce 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_nested.cc @@ -28,6 +28,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" +#include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/core/lib/core/status.h" @@ -116,6 +117,26 @@ Status IrEmitterNested::HandleParameter(HloInstruction* parameter) { Status IrEmitterNested::EmitTargetElementLoop( const HloInstruction& hlo, const llvm_ir::ElementGenerator& element_generator) { + // For MOF we give the loop emitter an array for every output it should + // generate. 
+ if (hlo.IsMultiOutputFusion()) { + std::vector target_arrays; + for (int64 i = 0, e = ShapeUtil::TupleElementCount(hlo.shape()); i != e; + ++i) { + target_arrays.push_back(GetIrArray(hlo, hlo, {i})); + } + TF_RETURN_IF_ERROR( + llvm_ir::LoopEmitter(element_generator, target_arrays, &ir_builder_) + .EmitLoop()); + + std::vector tuple_operand_ptrs; + for (const llvm_ir::IrArray& array : target_arrays) { + tuple_operand_ptrs.push_back(array.GetBasePointer()); + } + llvm_ir::EmitTuple(GetIrArray(hlo, hlo), tuple_operand_ptrs, &ir_builder_, + module_); + return Status::OK(); + } return llvm_ir::LoopEmitter(element_generator, GetIrArray(hlo, hlo), &ir_builder_) .EmitLoop(); diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc index c74795456474c9..4b0d62adf50da9 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc @@ -32,6 +32,7 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/buffer_assignment.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" +#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h" #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h" #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h" #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h" @@ -79,6 +80,7 @@ namespace { using llvm_ir::IrName; using tensorflow::gtl::ArraySlice; +using tensorflow::gtl::InlinedVector; using tensorflow::gtl::nullopt; using tensorflow::gtl::optional; using tensorflow::strings::StrCat; @@ -234,8 +236,39 @@ llvm::Function* IrEmitterUnnested::BuildKernelPrototype( return kernel; } +namespace { +// Computes the maximum valid unroll factor for a given instruction. +int ComputeMaxUnrollFactor(const HloInstruction* hlo) { + int max_unroll_factor = hlo->GetModule() + ->config() + .debug_options() + .xla_gpu_max_kernel_unroll_factor(); + + // Find the largest possible power of two to unroll by. + // TODO(kramerb): Make this smarter. + const Shape& element_shape = hlo->IsMultiOutputFusion() + ? ShapeUtil::GetSubshape(hlo->shape(), {0}) + : hlo->shape(); + int64 num_elements = ShapeUtil::ElementsIn(element_shape); + for (int i = max_unroll_factor; i > 1; i /= 2) { + if (num_elements % i == 0) { + return i; + } + } + + // Cannot unroll. + return 1; +} +} // namespace + Status IrEmitterUnnested::DefaultAction(HloInstruction* hlo) { - thunk_sequence_->emplace_back(BuildKernelThunk(hlo)); + int unroll_factor = 1; + // Unfused elementwise operations are usually memory bound, unroll them. 
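// ComputeMaxUnrollFactor above looks for the largest power of two, bounded by
// the xla_gpu_max_kernel_unroll_factor flag, that evenly divides the number
// of output elements; if none divides, the kernel is not unrolled. The core
// of that search as a standalone function, with the flag value passed in
// directly:
#include <cstdint>

namespace unroll_sketch {

int ComputeMaxUnrollFactor(std::int64_t num_elements, int max_unroll_factor) {
  // Try max_unroll_factor, then max_unroll_factor / 2, ... down to 2.
  for (int factor = max_unroll_factor; factor > 1; factor /= 2) {
    if (num_elements % factor == 0) {
      return factor;
    }
  }
  return 1;  // Cannot unroll.
}

// For example, with max_unroll_factor = 4: 1024 elements -> 4,
// 6 elements -> 2, 7 elements -> 1.

}  // namespace unroll_sketch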
+ if (hlo->IsElementwise()) { + unroll_factor = ComputeMaxUnrollFactor(hlo); + } + + thunk_sequence_->emplace_back(BuildKernelThunk(hlo, unroll_factor)); return IrEmitter::DefaultAction(hlo); } @@ -368,15 +401,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie(); auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie(); - const HloInstruction* algorithm_inst = custom_call->operand(2); - CHECK(algorithm_inst->IsConstant()) << algorithm_inst->ToString(); - int64 algorithm = algorithm_inst->literal().Get({}); - - const HloInstruction* tensor_ops_enabled_inst = custom_call->operand(3); - CHECK(tensor_ops_enabled_inst->IsConstant()) - << tensor_ops_enabled_inst->ToString(); - bool tensor_ops_enabled = tensor_ops_enabled_inst->literal().Get({}); - + TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config, + custom_call->backend_config()); const auto& target = custom_call->custom_call_target(); std::unique_ptr thunk; if (target == kCudnnConvForwardCallTarget) { @@ -391,7 +417,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { /*filter_shape=*/rhs_shape, /*output_shape=*/conv_result_shape, // custom_call->window(), custom_call->convolution_dimension_numbers(), - algorithm, tensor_ops_enabled, custom_call); + backend_config.algorithm(), backend_config.tensor_ops_enabled(), + custom_call); } else if (target == kCudnnConvBackwardInputCallTarget) { thunk = MakeUnique( CudnnConvKind::kBackwardInput, @@ -404,7 +431,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { /*filter_shape=*/rhs_shape, /*output_shape=*/lhs_shape, // custom_call->window(), custom_call->convolution_dimension_numbers(), - algorithm, tensor_ops_enabled, custom_call); + backend_config.algorithm(), backend_config.tensor_ops_enabled(), + custom_call); } else if (target == kCudnnConvBackwardFilterCallTarget) { thunk = MakeUnique( CudnnConvKind::kBackwardFilter, @@ -417,7 +445,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) { /*filter_shape=*/conv_result_shape, /*output_shape=*/rhs_shape, // custom_call->window(), custom_call->convolution_dimension_numbers(), - algorithm, tensor_ops_enabled, custom_call); + backend_config.algorithm(), backend_config.tensor_ops_enabled(), + custom_call); } else { LOG(FATAL) << "Unexpected custom call target: " << custom_call->custom_call_target(); @@ -445,12 +474,24 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { // initializes the output array to the initial value of the reduce. if (HloInstruction::FusionKind::kInput == fusion->fusion_kind()) { switch (root->opcode()) { + case HloOpcode::kTuple: case HloOpcode::kReduce: { VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString(); - TF_ASSIGN_OR_RETURN(std::unique_ptr initializer_thunk, - BuildInitializerThunk(fusion)); std::vector> thunks; - thunks.push_back(std::move(initializer_thunk)); + ArraySlice reduces = + root->opcode() == HloOpcode::kTuple + ? root->operands() + : ArraySlice(&root, 1); + + // For multi-output fusion emit an initializer for each tuple element. + // Otherwise it's sufficient to just initialize the single output. + for (int i = 0, e = reduces.size(); i != e; ++i) { + TF_ASSIGN_OR_RETURN( + std::unique_ptr initializer_thunk, + BuildInitializerThunk( + fusion, reduces[i] == root ? 
ShapeIndex() : ShapeIndex({i}))); + thunks.push_back(std::move(initializer_thunk)); + } thunks.push_back(BuildKernelThunk(fusion)); thunk_sequence_->emplace_back( MakeUnique(std::move(thunks), fusion)); @@ -464,11 +505,34 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { FusedIrEmitter fused_emitter(parameter_arrays, &elemental_emitter); TF_RETURN_IF_ERROR(root->Accept(&fused_emitter)); - Shape input_shape = root->operand(0)->shape(); - return EmitReductionToVector( - root, input_shape, fused_emitter.GetGenerator(root->operand(0)), - fused_emitter.GetGenerator(root->operand(1)), root->dimensions(), - root->to_apply()); + // For multi-output fusion CHECK the constraints and feed all the + // reduces into a single loop code generator. Single-output reduce + // fusion is a special case of that. + InlinedVector input_gens; + InlinedVector init_value_gens; + InlinedVector reducers; + for (const HloInstruction* reduce : reduces) { + CHECK_EQ(HloOpcode::kReduce, reduce->opcode()); + // TODO(kramerb): CHECK that layouts are equal. Currently this + // breaks multioutputfusion_test. The test has pre-fused + // instructions, but layout_assignment will not assign any layouts + // for instructions inside of a fused computation. It just removes + // the layouts instead. + CHECK(ShapeUtil::Compatible(reduces[0]->shape(), reduce->shape())); + CHECK(ShapeUtil::Compatible(reduces[0]->operand(0)->shape(), + reduce->operand(0)->shape())); + CHECK(ShapeUtil::Compatible(reduces[0]->operand(1)->shape(), + reduce->operand(1)->shape())); + CHECK(reduces[0]->dimensions() == reduce->dimensions()); + input_gens.push_back(fused_emitter.GetGenerator(reduce->operand(0))); + init_value_gens.push_back( + fused_emitter.GetGenerator(reduce->operand(1))); + reducers.push_back(reduce->to_apply()); + } + const Shape& input_shape = reduces[0]->operand(0)->shape(); + return EmitReductionToVector(reduces[0], input_shape, input_gens, + init_value_gens, reduces[0]->dimensions(), + reducers); } default: LOG(FATAL) << "Bad opcode for input fusion: " @@ -514,24 +578,8 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) { return Status::OK(); } - int max_unroll_factor = fusion->GetModule() - ->config() - .debug_options() - .xla_gpu_max_kernel_unroll_factor(); - - // Find the largest possible power of two to unroll by. - // TODO(kramerb): Make this smarter. - int unroll_factor = 1; - if (!fusion->IsMultiOutputFusion()) { - CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop); - int64 num_elements = ShapeUtil::ElementsIn(fusion->shape()); - for (int i = max_unroll_factor; i > 1; i /= 2) { - if (num_elements % i == 0) { - unroll_factor = i; - break; - } - } - } + CHECK(fusion->fusion_kind() == HloInstruction::FusionKind::kLoop); + int unroll_factor = ComputeMaxUnrollFactor(fusion); thunk_sequence_->emplace_back(BuildKernelThunk(fusion, unroll_factor)); return IrEmitter::HandleFusion(fusion); @@ -870,8 +918,9 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) { Status IrEmitterUnnested::EmitReductionToScalar( HloInstruction* reduce, const Shape& input_shape, - const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, HloComputation* reducer) { + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, + tensorflow::gtl::ArraySlice reducers) { // Number of elements processed by a single thread. 
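// The multi-output reduce path above keeps parallel vectors of input
// generators, init values, and reducer computations, then evaluates all of
// them inside one loop over the (compatible) input shape. The same idea on
// the host, as a standalone sketch; inputs are assumed to have equal length,
// and a std::function stands in for the reducer HloComputation.
#include <cstddef>
#include <functional>
#include <vector>

namespace multi_reduce_sketch {

using Reducer = std::function<float(float, float)>;

std::vector<float> ReduceAllAtOnce(
    const std::vector<std::vector<float>>& inputs,
    const std::vector<float>& init_values,
    const std::vector<Reducer>& reducers) {
  std::vector<float> partial_results = init_values;
  const std::size_t num_elements = inputs.empty() ? 0 : inputs[0].size();
  for (std::size_t i = 0; i < num_elements; ++i) {
    // One pass over the data updates every reduction's partial result,
    // mirroring the per-reduce result slots in the fused kernel.
    for (std::size_t r = 0; r < reducers.size(); ++r) {
      partial_results[r] = reducers[r](partial_results[r], inputs[r][i]);
    }
  }
  return partial_results;
}

}  // namespace multi_reduce_sketch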
constexpr int64 kTileSize = 16; int64 num_elems = ShapeUtil::ElementsIn(input_shape); @@ -923,16 +972,19 @@ Status IrEmitterUnnested::EmitReductionToScalar( // auto loop_body_emitter = [=](const llvm_ir::IrArray::Index& tile_index) -> Status { + const int num_reduces = reducers.size(); llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_); - llvm::Value* partial_reduction_result_address = - llvm_ir::EmitAllocaAtFunctionEntry(element_ir_type, - "partial_reduction_result", - &ir_builder_); - { - TF_ASSIGN_OR_RETURN(llvm::Value * init_ir_value, - init_value_gen(llvm_ir::IrArray::Index({}))); + std::vector partial_reduction_result_addresses; + for (int i = 0; i != num_reduces; ++i) { + llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( + element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + llvm::Twine(i)); + TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, + init_value_gens[i](llvm_ir::IrArray::Index({}))); ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address); + partial_reduction_result_addresses.push_back( + partial_reduction_result_address); } llvm::Value* x_in_tiles = tile_index[0]; @@ -965,11 +1017,16 @@ Status IrEmitterUnnested::EmitReductionToScalar( llvm_ir::IrArray::Index input_index( /*linear=*/x, input_shape, &ir_builder_); llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type); - TF_ASSIGN_OR_RETURN(llvm::Value * input_ir_value, input_gen(input_index)); - ir_builder_.CreateStore(input_ir_value, input_address); - return (EmitCallToNestedComputation( - *reducer, {partial_reduction_result_address, input_address}, - partial_reduction_result_address)); + for (int i = 0; i != num_reduces; ++i) { + TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, + input_gens[i](input_index)); + ir_builder_.CreateStore(input_ir_value, input_address); + TF_RETURN_IF_ERROR(EmitCallToNestedComputation( + *reducers[i], + {partial_reduction_result_addresses[i], input_address}, + partial_reduction_result_addresses[i])); + } + return Status::OK(); }; // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's @@ -1004,21 +1061,24 @@ Status IrEmitterUnnested::EmitReductionToScalar( : element_ir_type; for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1; shuffle_distance /= 2) { - llvm::Value* partial_reduction_result = ir_builder_.CreateLoad( - ir_builder_.CreateBitCast(partial_reduction_result_address, - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca( element_ir_type, nullptr, "result_from_other_lane"); - ir_builder_.CreateStore( - EmitShuffleDown(partial_reduction_result, - ir_builder_.getInt32(shuffle_distance), - &ir_builder_, module_), - ir_builder_.CreateBitCast(result_from_other_lane, - shuffle_ir_type->getPointerTo())); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducer, {partial_reduction_result_address, result_from_other_lane}, - partial_reduction_result_address)); + for (int i = 0; i != num_reduces; ++i) { + llvm::Value* partial_reduction_result = ir_builder_.CreateLoad( + ir_builder_.CreateBitCast(partial_reduction_result_addresses[i], + shuffle_ir_type->getPointerTo()), + "partial_reduction_result"); + ir_builder_.CreateStore( + EmitShuffleDown(partial_reduction_result, + ir_builder_.getInt32(shuffle_distance), + &ir_builder_, module_), + ir_builder_.CreateBitCast(result_from_other_lane, + shuffle_ir_type->getPointerTo())); + 
TF_RETURN_IF_ERROR(EmitCallToNestedComputation( + *reducers[i], + {partial_reduction_result_addresses[i], result_from_other_lane}, + partial_reduction_result_addresses[i])); + } } const HloInstruction* output = @@ -1034,14 +1094,25 @@ Status IrEmitterUnnested::EmitReductionToScalar( "lane_id_is_zero", &ir_builder_); llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &ir_builder_); - llvm::Value* output_address = - GetIrArray(*output, *output) - .EmitArrayElementAddress( - llvm_ir::IrArray::Index(/*linear=*/ir_builder_.getInt64(0), - output->shape(), &ir_builder_), - &ir_builder_, "output_element_address"); - return EmitAtomicOperationForNestedComputation( - *reducer, output_address, partial_reduction_result_address); + + for (int i = 0; i != num_reduces; ++i) { + ShapeIndex output_shape_index; + if (output->IsMultiOutputFusion()) { + output_shape_index = {i}; + } + llvm::Value* output_address = + GetIrArray(*output, *output, output_shape_index) + .EmitArrayElementAddress( + llvm_ir::IrArray::Index( + /*linear=*/ir_builder_.getInt64(0), + ShapeUtil::GetSubshape(output->shape(), + output_shape_index), + &ir_builder_), + &ir_builder_, "output_element_address"); + TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, partial_reduction_result_addresses[i])); + } + return Status::OK(); }; // Emit a parallel loop that iterates through all input tiles, one per thread. @@ -1061,8 +1132,9 @@ Status IrEmitterUnnested::EmitReductionToScalar( Status IrEmitterUnnested::EmitColumnReduction( int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape, - const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, HloComputation* reducer) { + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, + tensorflow::gtl::ArraySlice reducers) { // Divide the input matrix into tiles of size Kx1. For example, when the // input matrix is 4x4 and K=2, the tiled matrix looks like // @@ -1072,9 +1144,13 @@ Status IrEmitterUnnested::EmitColumnReduction( // 4567 // Numbers indicate tile IDs. // // Each tile is first partially reduced to a scalar by a thread, and then the - // scalar is accumulated to the output vector using atomic operations. We - // choose 16 as the tile size, which matches Eigen's ColumnReduceKernel. - constexpr int64 kTileSize = 16; + // scalar is accumulated to the output vector using atomic operations. + // + // We choose 128 as the tile size based on empirical evidence. It's big enough + // to reduce the amount of atomic adds in the end, maximizing the memory + // bandwidth. + constexpr int64 kTileSize = 128; + // If the height is not a multiple of the tile size, we pad the bottom of the // input matrix. const int64 height_in_tiles = CeilOfRatio(height, kTileSize); @@ -1104,15 +1180,20 @@ Status IrEmitterUnnested::EmitColumnReduction( // } auto loop_body_emitter = [=](const llvm_ir::IrArray::Index& tile_index) -> Status { + const int num_reduces = reducers.size(); // Emit the loop body that reduces one tile. 
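The shuffle loop above implements the usual butterfly reduction across a warp: at each step every lane combines its partial result with the one held by the lane `shuffle_distance` above it. A host-side simulation of that pattern, assuming the CUDA warp size of 32 and addition as the reducer (an illustration of what the emitted `EmitShuffleDown` loop computes, not the IR itself):

```c++
// Host-side simulation of the shuffle-down tree reduction: after
// log2(kWarpSize) steps, lane 0 holds the reduction of the whole warp.
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  constexpr int kWarpSize = 32;
  std::vector<float> lanes(kWarpSize);
  std::iota(lanes.begin(), lanes.end(), 1.0f);  // lane i holds i + 1

  for (int shuffle_distance = kWarpSize / 2; shuffle_distance >= 1;
       shuffle_distance /= 2) {
    std::vector<float> from_other_lane(kWarpSize);
    for (int lane = 0; lane < kWarpSize; ++lane) {
      // Shuffle-down semantics: read the value held by the lane
      // `shuffle_distance` above; out-of-range lanes keep their own value.
      int src = lane + shuffle_distance;
      from_other_lane[lane] = src < kWarpSize ? lanes[src] : lanes[lane];
    }
    for (int lane = 0; lane < kWarpSize; ++lane) {
      lanes[lane] += from_other_lane[lane];  // the "reducer" here is addition
    }
  }

  std::cout << "lane 0 holds " << lanes[0] << " (expected 528)\n";
  return 0;
}
```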
llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType(input_shape.element_type(), module_); - llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( - element_ir_type, /*ArraySize=*/nullptr, "partial_reduction_result"); - { - TF_ASSIGN_OR_RETURN(llvm::Value * init_ir_value, - init_value_gen(llvm_ir::IrArray::Index({}))); + std::vector partial_reduction_result_addresses; + for (int i = 0; i != num_reduces; ++i) { + llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( + element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + llvm::Twine(i)); + TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, + init_value_gens[i](llvm_ir::IrArray::Index({}))); ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address); + partial_reduction_result_addresses.push_back( + partial_reduction_result_address); } // Emit an inner for-loop that partially reduces the elements in the given @@ -1170,13 +1251,17 @@ Status IrEmitterUnnested::EmitColumnReduction( .SourceIndexOfTranspose(normalized_input_shape, input_shape, transpose_dimension_mapping, &ir_builder_); - TF_ASSIGN_OR_RETURN(llvm::Value * input_ir_value, - input_gen(input_index)); - ir_builder_.CreateStore(input_ir_value, input_address); + for (int i = 0; i != num_reduces; ++i) { + TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, + input_gens[i](input_index)); + ir_builder_.CreateStore(input_ir_value, input_address); + TF_RETURN_IF_ERROR(EmitCallToNestedComputation( + *reducers[i], + {partial_reduction_result_addresses[i], input_address}, + partial_reduction_result_addresses[i])); + } + return Status::OK(); } - return (EmitCallToNestedComputation( - *reducer, {partial_reduction_result_address, input_address}, - partial_reduction_result_address)); }; // y_end = kTileSize + y_in_tiles * kTileSize, i.e., the y location that's @@ -1205,13 +1290,24 @@ Status IrEmitterUnnested::EmitColumnReduction( &ir_builder_); const HloInstruction* output = reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce; - llvm::Value* output_address = - GetIrArray(*output, *output) - .EmitArrayElementAddress( - llvm_ir::IrArray::Index(x, output->shape(), &ir_builder_), - &ir_builder_, "output_element_address"); - return EmitAtomicOperationForNestedComputation( - *reducer, output_address, partial_reduction_result_address); + for (int i = 0; i != num_reduces; ++i) { + ShapeIndex output_shape_index; + if (output->IsMultiOutputFusion()) { + output_shape_index = {i}; + } + llvm::Value* output_address = + GetIrArray(*output, *output, output_shape_index) + .EmitArrayElementAddress( + llvm_ir::IrArray::Index( + x, + ShapeUtil::GetSubshape(output->shape(), + output_shape_index), + &ir_builder_), + &ir_builder_, "output_element_address"); + TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, partial_reduction_result_addresses[i])); + } + return Status::OK(); }; // Emit a parallel loop that iterate through all input tiles. @@ -1231,8 +1327,10 @@ Status IrEmitterUnnested::EmitColumnReduction( Status IrEmitterUnnested::EmitRowReduction( int64 depth, int64 height, int64 width, HloInstruction* reduce, - const Shape& input_shape, const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, HloComputation* reducer) { + const Shape& input_shape, + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, + tensorflow::gtl::ArraySlice reducers) { // A naive algorithm is: // 1. 
Divide the input tensor into tiles of size 1x1xK. // 2. Partially reduces each tile to a scalar using one thread. @@ -1322,15 +1420,20 @@ Status IrEmitterUnnested::EmitRowReduction( auto loop_body_emitter = [=](const llvm_ir::IrArray::Index& tile_index) -> Status { + const int num_reduces = reducers.size(); // Emit the loop body that reduces one tile. llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType( input_shape.element_type(), ir_emitter_context_->llvm_module()); - llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( - element_ir_type, /*ArraySize=*/nullptr, "partial_reduction_result"); - { - TF_ASSIGN_OR_RETURN(llvm::Value * init_ir_value, - init_value_gen(llvm_ir::IrArray::Index({}))); + std::vector partial_reduction_result_addresses; + for (int i = 0; i != num_reduces; ++i) { + llvm::Value* partial_reduction_result_address = ir_builder_.CreateAlloca( + element_ir_type, /*ArraySize=*/nullptr, + "partial_reduction_result." + llvm::Twine(i)); + TF_ASSIGN_OR_RETURN(llvm::Value* const init_ir_value, + init_value_gens[i](llvm_ir::IrArray::Index({}))); ir_builder_.CreateStore(init_ir_value, partial_reduction_result_address); + partial_reduction_result_addresses.push_back( + partial_reduction_result_address); } // Emit an inner for-loop that partially reduces the elements in the given @@ -1413,13 +1516,17 @@ Status IrEmitterUnnested::EmitRowReduction( .SourceIndexOfTranspose(normalized_input_shape, input_shape, transpose_dimension_mapping, &ir_builder_); - TF_ASSIGN_OR_RETURN(llvm::Value * input_ir_value, - input_gen(input_index)); - ir_builder_.CreateStore(input_ir_value, input_address); + for (int i = 0; i != num_reduces; ++i) { + TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value, + input_gens[i](input_index)); + ir_builder_.CreateStore(input_ir_value, input_address); + TF_RETURN_IF_ERROR(EmitCallToNestedComputation( + *reducers[i], + {partial_reduction_result_addresses[i], input_address}, + partial_reduction_result_addresses[i])); + } + return Status::OK(); } - return EmitCallToNestedComputation( - *reducer, {partial_reduction_result_address, input_address}, - partial_reduction_result_address); }; llvm::Value* tile_in_bounds = ir_builder_.CreateOr( @@ -1447,21 +1554,24 @@ Status IrEmitterUnnested::EmitRowReduction( : element_ir_type; for (int shuffle_distance = (kWarpSize / 2); shuffle_distance >= 1; shuffle_distance /= 2) { - llvm::Value* partial_reduction_result = ir_builder_.CreateLoad( - ir_builder_.CreateBitCast(partial_reduction_result_address, - shuffle_ir_type->getPointerTo()), - "partial_reduction_result"); llvm::Value* result_from_other_lane = ir_builder_.CreateAlloca( element_ir_type, nullptr, "result_from_other_lane"); - ir_builder_.CreateStore( - EmitShuffleDown(partial_reduction_result, - ir_builder_.getInt32(shuffle_distance), - &ir_builder_, module_), - ir_builder_.CreateBitCast(result_from_other_lane, - shuffle_ir_type->getPointerTo())); - TF_RETURN_IF_ERROR(EmitCallToNestedComputation( - *reducer, {partial_reduction_result_address, result_from_other_lane}, - partial_reduction_result_address)); + for (int i = 0; i != num_reduces; ++i) { + llvm::Value* partial_reduction_result = ir_builder_.CreateLoad( + ir_builder_.CreateBitCast(partial_reduction_result_addresses[i], + shuffle_ir_type->getPointerTo()), + "partial_reduction_result"); + ir_builder_.CreateStore( + EmitShuffleDown(partial_reduction_result, + ir_builder_.getInt32(shuffle_distance), + &ir_builder_, module_), + ir_builder_.CreateBitCast(result_from_other_lane, + 
shuffle_ir_type->getPointerTo())); + TF_RETURN_IF_ERROR(EmitCallToNestedComputation( + *reducers[i], + {partial_reduction_result_addresses[i], result_from_other_lane}, + partial_reduction_result_addresses[i])); + } } const HloInstruction* output = @@ -1475,13 +1585,24 @@ Status IrEmitterUnnested::EmitRowReduction( "lane_id_is_zero", &ir_builder_); llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block, &ir_builder_); - llvm::Value* output_address = - GetIrArray(*output, *output) - .EmitArrayElementAddress( - llvm_ir::IrArray::Index(y, output->shape(), &ir_builder_), - &ir_builder_, "output_element_address"); - return EmitAtomicOperationForNestedComputation( - *reducer, output_address, partial_reduction_result_address); + for (int i = 0; i != num_reduces; ++i) { + ShapeIndex output_shape_index; + if (output->IsMultiOutputFusion()) { + output_shape_index = {i}; + } + llvm::Value* output_address = + GetIrArray(*output, *output, output_shape_index) + .EmitArrayElementAddress( + llvm_ir::IrArray::Index( + y, + ShapeUtil::GetSubshape(output->shape(), + output_shape_index), + &ir_builder_), + &ir_builder_, "output_element_address"); + TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation( + *reducers[i], output_address, partial_reduction_result_addresses[i])); + } + return Status::OK(); }; // Emit a parallel loop that iterates through every input tiles. @@ -1508,10 +1629,10 @@ Status IrEmitterUnnested::EmitRowReduction( // elementwise. Status IrEmitterUnnested::EmitReductionToVector( HloInstruction* reduce, const Shape& input_shape, - const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, tensorflow::gtl::ArraySlice dimensions_to_reduce, - HloComputation* reducer) { + tensorflow::gtl::ArraySlice reducers) { // This emission requires "reduce" to have an input layout. It is either set // by LayoutAssignment (for a top-level kReduce) or by InstructionFusion (for // a fused kReduce). @@ -1546,8 +1667,8 @@ Status IrEmitterUnnested::EmitReductionToVector( // `EmitReductionToVector`, we only need to check whether the minormost // dimension of the input is to keep. if (input_dims_to_keep.empty()) { - return EmitReductionToScalar(reduce, input_shape, input_gen, init_value_gen, - reducer); + return EmitReductionToScalar(reduce, input_shape, input_gens, + init_value_gens, reducers); } else if (input_dims_to_keep.front() == LayoutUtil::Minor(input_shape.layout(), 0)) { // Column reduction. Treat the result of "input" as a matrix whose width @@ -1564,8 +1685,8 @@ Status IrEmitterUnnested::EmitReductionToVector( height *= input_shape.dimensions(input_dim); } } - return EmitColumnReduction(height, width, reduce, input_shape, input_gen, - init_value_gen, reducer); + return EmitColumnReduction(height, width, reduce, input_shape, input_gens, + init_value_gens, reducers); } else { // Reduce the row dimension of a matrix or reduce dimension 0 and 2 in a // 3D tensor. 
The size of dimension 1 (the height) is the size of the @@ -1591,7 +1712,7 @@ Status IrEmitterUnnested::EmitReductionToVector( } const int64 height = ShapeUtil::ElementsIn(reduce->shape()); return EmitRowReduction(depth, height, width, reduce, input_shape, - input_gen, init_value_gen, reducer); + input_gens, init_value_gens, reducers); } } @@ -1615,16 +1736,15 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) { MakeUnique(std::move(thunks), reduce)); return EmitReductionToVector( - reduce, input->shape(), - [&](const llvm_ir::IrArray::Index& index) { + reduce, input->shape(), {[&](const llvm_ir::IrArray::Index& index) { return GetIrArray(*input, *reduce) .EmitReadArrayElement(index, &ir_builder_); - }, - [&](const llvm_ir::IrArray::Index& index) { + }}, + {[&](const llvm_ir::IrArray::Index& index) { return GetIrArray(*init_value, *reduce) .EmitReadArrayElement(index, &ir_builder_); - }, - dimensions_to_reduce, reducer); + }}, + dimensions_to_reduce, {reducer}); } thunk_sequence_->emplace_back(BuildKernelThunk(reduce)); @@ -1892,6 +2012,52 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) { return IrEmitter::HandleSelect(select); } +Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) { + if (hlo_module_config_.replica_count() != 1) { + // TODO(b/33011107): Support nontrivial cross replica sum on GPU. + return Unimplemented( + "CrossReplicaSum with >1 replica is not implemented on GPU."); + } + + // CRS with one operand and one replica is simply the identity function. + // Buffer assignment expects a copy, so that's what we do. + // + // TODO(b/80100934): We would like to eliminate one-replica CRS nodes entirely + // in algebraic-simplifier, but currently on some platforms + // HloModuleConfig::num_replicas changes between when the module is compiled + // and when it's run. + if (crs->operand_count() == 1) { + CHECK(ShapeUtil::IsArray(crs->operand(0)->shape())) + << "Operands to cross-replica-sum must be arrays: " << crs->ToString(); + thunk_sequence_->push_back(MakeUnique( + /*source_address=*/GetAllocationSlice(*crs->operand(0)), + /*destination_buffer=*/GetAllocationSlice(*crs), + /*mem_size=*/ShapeUtil::ByteSizeOf(crs->shape()), crs)); + return Status::OK(); + } + + // One-replica CRS with multiple operands produces a tuple of the inputs. + // Again, buffer assignment expects us to copy each. + std::vector> thunks; + std::vector tuple_element_buffers; + for (int64 i = 0; i < crs->operand_count(); ++i) { + tuple_element_buffers.push_back(ir_emitter_context_->buffer_assignment() + .GetUniqueSlice(crs, {i}) + .ValueOrDie()); + thunks.push_back(MakeUnique( + /*source_address=*/GetAllocationSlice(*crs->operand(i)), + /*destination_buffer=*/tuple_element_buffers.back(), + /*mem_size=*/ShapeUtil::ByteSizeOf(crs->operand(i)->shape()), crs)); + } + + // Output a tuple of the buffers above. 
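Earlier in this hunk, `EmitReductionToVector` picks between the scalar, column, and row emitters depending on which input dimensions are kept and whether the first kept dimension is minormost in the layout. A standalone sketch of that classification for one concrete case (plain C++; the shape, layout, and kept dimensions below are made-up example values):

```c++
// Sketch of the EmitReductionToVector dispatch: no kept dimensions means a
// reduction to scalar; a kept dimension that is minormost means neighbouring
// elements of an output "column" are contiguous, so the column emitter is
// used; otherwise the row emitter is used.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Example: a [depth=8, height=100, width=64] input, reducing dimensions
  // {0, 2} and keeping {1}; with a row-major layout the minormost dimension
  // is 2, so the kept dimension is not minormost -> row reduction.
  const std::vector<int64_t> dims = {8, 100, 64};
  const std::vector<int64_t> dims_to_keep = {1};
  const int64_t minormost_dim = 2;

  if (dims_to_keep.empty()) {
    std::cout << "reduction to scalar\n";
  } else if (dims_to_keep.front() == minormost_dim) {
    // Column reduction: width is the product of the kept dimensions, height
    // the product of the reduced ones.
    int64_t width = 1, height = 1;
    for (int64_t d = 0; d < static_cast<int64_t>(dims.size()); ++d) {
      bool kept = std::find(dims_to_keep.begin(), dims_to_keep.end(), d) !=
                  dims_to_keep.end();
      (kept ? width : height) *= dims[d];
    }
    std::cout << "column reduction, height=" << height << " width=" << width
              << "\n";
  } else {
    // Row reduction: height is the product of the kept dimensions.
    std::cout << "row reduction, height=" << dims[1] << "\n";
  }
  return 0;
}
```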
+ thunks.push_back(MakeUnique(tuple_element_buffers, + GetAllocationSlice(*crs), crs)); + thunk_sequence_->push_back( + MakeUnique(std::move(thunks), crs)); + return Status::OK(); +} + Status IrEmitterUnnested::HandleInfeed(HloInstruction* infeed) { thunk_sequence_->emplace_back(BuildInfeedThunk(infeed)); return Status::OK(); @@ -2158,6 +2324,21 @@ std::unique_ptr IrEmitterUnnested::BuildInfeedThunk( /*destination_buffer=*/GetAllocationSlice(*inst), inst); } +namespace { +double GetScalarConstantAsDouble(const Literal& literal) { + switch (literal.shape().element_type()) { + case F16: + return static_cast(literal.Get({})); + case F32: + return literal.Get({}); + case F64: + return literal.Get({}); + default: + LOG(FATAL) << "Unsupported type."; + } +} +} // namespace + std::unique_ptr IrEmitterUnnested::BuildGemmThunk( const HloInstruction* inst) { if (inst->opcode() == HloOpcode::kDot) { @@ -2170,65 +2351,48 @@ std::unique_ptr IrEmitterUnnested::BuildGemmThunk( lhs->shape(), // The shape of LHS. rhs->shape(), // The shape of RHS. inst->shape(), // The shape of the output. - false, // Do not transpose LHS. - false, // Do not transpose RHS. 1.0, // alpha. inst); } if (inst->opcode() == HloOpcode::kFusion) { - if (inst->fusion_kind() == HloInstruction::FusionKind::kOutput) { - const HloInstruction* mul = inst->fused_expression_root(); - const HloInstruction* dot = mul->operand(0); - const HloInstruction* alpha = mul->operand(1); - if (dot->opcode() != HloOpcode::kDot) { - std::swap(dot, alpha); - } - DCHECK(dot->opcode() == HloOpcode::kDot); - const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0)); - const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1)); - DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter && - rhs_parameter->opcode() == HloOpcode::kParameter); - const HloInstruction* lhs = - inst->operand(lhs_parameter->parameter_number()); - const HloInstruction* rhs = - inst->operand(rhs_parameter->parameter_number()); - - return MakeUnique( - GetAllocationSlice(*lhs), // The buffer assigned to LHS. - GetAllocationSlice(*rhs), // The buffer assigned to RHS. - GetAllocationSlice(*mul), // The output buffer. - lhs->shape(), // The shape of LHS. - rhs->shape(), // The shape of RHS. - inst->shape(), // The shape of the output. - dot->operand(0)->IsRank2Transpose(), // Transpose LHS. - dot->operand(1)->IsRank2Transpose(), // Transpose RHS. - alpha->literal().Get({0}), // alpha. - inst); - } else { - const HloInstruction* dot = inst->fused_expression_root(); - DCHECK(dot->opcode() == HloOpcode::kDot); - const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0)); - const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1)); - DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter && - rhs_parameter->opcode() == HloOpcode::kParameter); - const HloInstruction* lhs = - inst->operand(lhs_parameter->parameter_number()); - const HloInstruction* rhs = - inst->operand(rhs_parameter->parameter_number()); - - return MakeUnique( - GetAllocationSlice(*lhs), // The buffer assigned to LHS. - GetAllocationSlice(*rhs), // The buffer assigned to RHS. - GetAllocationSlice(*inst), // The output buffer. - lhs->shape(), // The shape of LHS. - rhs->shape(), // The shape of RHS. - inst->shape(), // The shape of the output. - dot->operand(0)->IsRank2Transpose(), // Transpose LHS. - dot->operand(1)->IsRank2Transpose(), // Transpose RHS. - 1.0, // Alpha. 
- inst); + CHECK_EQ(inst->fusion_kind(), HloInstruction::FusionKind::kOutput); + const HloInstruction* mul = inst->fused_expression_root(); + const HloInstruction* dot = mul->operand(0); + const HloInstruction* alpha = mul->operand(1); + if (dot->opcode() != HloOpcode::kDot) { + std::swap(dot, alpha); } + if (alpha->opcode() == HloOpcode::kBroadcast) { + alpha = alpha->operand(0); + } + alpha = inst->operand(alpha->parameter_number()); + // TODO(b/74185543): Remove the following if block once we support fusion + // with a non-constant as well. Then we will just always use the constant + // on the device. + if (alpha->opcode() == HloOpcode::kCopy) { + alpha = alpha->operand(0); + } + + DCHECK(dot->opcode() == HloOpcode::kDot); + const HloInstruction* lhs_parameter = StripTranspose(*dot->operand(0)); + const HloInstruction* rhs_parameter = StripTranspose(*dot->operand(1)); + DCHECK(lhs_parameter->opcode() == HloOpcode::kParameter && + rhs_parameter->opcode() == HloOpcode::kParameter); + const HloInstruction* lhs = + inst->operand(lhs_parameter->parameter_number()); + const HloInstruction* rhs = + inst->operand(rhs_parameter->parameter_number()); + + return MakeUnique( + GetAllocationSlice(*lhs), // The buffer assigned to LHS. + GetAllocationSlice(*rhs), // The buffer assigned to RHS. + GetAllocationSlice(*inst), // The output buffer. + lhs->shape(), // The shape of LHS. + rhs->shape(), // The shape of RHS. + inst->shape(), // The shape of the output. + GetScalarConstantAsDouble(alpha->literal()), // alpha. + inst); } LOG(FATAL) << "Cannot build a GemmThunk for " << inst->ToString(); @@ -2245,7 +2409,7 @@ std::unique_ptr IrEmitterUnnested::BuildFftThunk( } StatusOr> IrEmitterUnnested::BuildInitializerThunk( - const HloInstruction* hlo) { + const HloInstruction* hlo, const ShapeIndex& index) { bool fused = HloOpcode::kFusion == hlo->opcode(); const HloInstruction* inst = fused ? hlo->fused_expression_root() : hlo; const HloInstruction* init_value = [&] { @@ -2254,6 +2418,14 @@ StatusOr> IrEmitterUnnested::BuildInitializerThunk( return inst->operand(2); case HloOpcode::kReduce: return inst->operand(1); + case HloOpcode::kTuple: + CHECK(hlo->IsMultiOutputFusion()) + << ": " << hlo->ToString() << " is not a multi-output fusion."; + CHECK(inst->operand(index.back())->opcode() == HloOpcode::kReduce) + << ": Found '" << inst->operand(index.back())->opcode() << "' in " + << inst->ToString() << " but expected 'reduce'."; + // For multi-output fusion look through the tuple. 
+ return inst->operand(index.back())->operand(1); default: LOG(FATAL) << "Opcode " << inst->opcode() << " should not need an initializer."; @@ -2277,7 +2449,7 @@ StatusOr> IrEmitterUnnested::BuildInitializerThunk( ArraySlice literal_bytes( reinterpret_cast(literal.untyped_data()), num_bytes); if (c_all_of(literal_bytes, [](uint8 byte) { return byte == 0; })) { - return {MakeUnique(GetAllocationSlice(*hlo), hlo)}; + return {MakeUnique(GetAllocationSlice(*hlo, index), hlo)}; } // If the literal is 8 or 16 bits wide, we can emit a 32-bit memset by @@ -2293,8 +2465,8 @@ StatusOr> IrEmitterUnnested::BuildInitializerThunk( pattern16 = literal_bytes.front(); } uint32 pattern32 = uint32{pattern16} | (uint32{pattern16} << 16); - return {MakeUnique(pattern32, - GetAllocationSlice(*hlo), hlo)}; + return {MakeUnique( + pattern32, GetAllocationSlice(*hlo, index), hlo)}; } // If the literal is an even multiple of 32 bits wide, we can emit a 32-bit @@ -2304,8 +2476,8 @@ StatusOr> IrEmitterUnnested::BuildInitializerThunk( literal_bytes.size() - 4) == 0) { uint32 word; memcpy(&word, literal_bytes.data(), sizeof(word)); - return {MakeUnique(word, GetAllocationSlice(*hlo), - hlo)}; + return {MakeUnique( + word, GetAllocationSlice(*hlo, index), hlo)}; } } @@ -2504,16 +2676,14 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk( .EmitLoop(IrName(&hlo)); } - CHECK_EQ(unroll_factor, 1) - << "multi-output fusion does not support unrolling"; - // For multiple outputs fusion, we need to emit each operand and the root. std::vector output_arrays; for (int64 i = 0; i < ShapeUtil::TupleElementCount(hlo.shape()); ++i) { output_arrays.push_back(GetIrArray(hlo, hlo, {i})); } TF_RETURN_IF_ERROR(ParallelLoopEmitter(element_generator, output_arrays, - launch_dimensions, &ir_builder_) + launch_dimensions, &ir_builder_, + unroll_factor) .EmitLoop(IrName(&hlo))); std::vector tuple_operand_ptrs; diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h index b842f480c6257c..b41eaa303b0aad 100644 --- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h +++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h @@ -38,7 +38,7 @@ namespace gpu { // // Examples of things that are not unnested computations: // -// - The reducer of a kReduce HLO. This is emited using IrEmitterNested. +// - The reducer of a kReduce HLO. This is emitted using IrEmitterNested. // - The body of a fusion node. IrEmitterUnenested emits the relevant code // within a kernel function using FusedIrEmitter. (FusedIrEmitter is not // really an IrEmitter, but is more an "IR generator generator".) @@ -76,6 +76,7 @@ class IrEmitterUnnested : public IrEmitter { Status HandleInfeed(HloInstruction* xla_infeed) override; Status HandleRng(HloInstruction* random) override; Status HandleSelect(HloInstruction* select) override; + Status HandleCrossReplicaSum(HloInstruction* crs) override; Status EmitTargetElementLoop( const HloInstruction& hlo, @@ -109,28 +110,31 @@ class IrEmitterUnnested : public IrEmitter { // `EmitReductionToVector`. Note that input shape might not be // [height x width], but can be bitcast to [height x weight] with "height" // being the major dimension. 
- Status EmitColumnReduction(int64 height, int64 width, HloInstruction* reduce, - const Shape& input_shape, - const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, - HloComputation* reducer); + Status EmitColumnReduction( + int64 height, int64 width, HloInstruction* reduce, + const Shape& input_shape, + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, + tensorflow::gtl::ArraySlice reducers); // Emits code that reduces a 3D tensor of shape [depth x height x width] to a // vector of shape [height]. Other parameters have the same meaning as those // of `EmitReductionToVector`. Note that input shape might not be // [depth x height x width], but can be bitcast to [depth x height x weight] // with "depth" being the most major dimension. - Status EmitRowReduction(int64 depth, int64 height, int64 width, - HloInstruction* reduce, const Shape& input_shape, - const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, - HloComputation* reducer); + Status EmitRowReduction( + int64 depth, int64 height, int64 width, HloInstruction* reduce, + const Shape& input_shape, + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, + tensorflow::gtl::ArraySlice reducers); // Emits code that reduces a tensor of arbitrary rank to a scalar. - Status EmitReductionToScalar(HloInstruction* reduce, const Shape& input_shape, - const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, - HloComputation* reducer); + Status EmitReductionToScalar( + HloInstruction* reduce, const Shape& input_shape, + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, + tensorflow::gtl::ArraySlice reducers); // Figures out whether `reduce` is a row or column reduction, and which // dimensions to reduce, and calls either `EmitRowReduction` or @@ -140,13 +144,16 @@ class IrEmitterUnnested : public IrEmitter { // generate elements of the input and the initial value. Other parameters mean // the same as for `HandleReduce`. // + // Multiple reduces can be emitted in the same loop, assuming they have the + // same input and output shapes, and the same reduce dimensions. + // // Prerequisite: `IsReductionToVector(*reduce)` Status EmitReductionToVector( HloInstruction* reduce, const Shape& input_shape, - const llvm_ir::ElementGenerator& input_gen, - const llvm_ir::ElementGenerator& init_value_gen, + tensorflow::gtl::ArraySlice input_gens, + tensorflow::gtl::ArraySlice init_value_gens, tensorflow::gtl::ArraySlice dimensions_to_reduce, - HloComputation* reducer); + tensorflow::gtl::ArraySlice reducers); // Returns a KernelThunk that invokes the kernel emitted for `inst`. The // caller needs to make sure `inst` outlives the lifetime of the returned @@ -165,7 +172,7 @@ class IrEmitterUnnested : public IrEmitter { // Returns a thunk that, given a reduce or select-and-scatter op, initializes // its memory to the appropriate initial value. StatusOr> BuildInitializerThunk( - const HloInstruction* hlo); + const HloInstruction* hlo, const ShapeIndex& index = {}); // Returns a thunk that calls host-to-device cuMemcpy to implement `inst`. 
std::unique_ptr BuildHostToDeviceCopyThunk(const HloInstruction* inst); diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc index e499a9b0091d3f..5355d34c019c88 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.cc @@ -35,29 +35,41 @@ KernelThunk::KernelThunk( kernel_name_(kernel_name), unroll_factor_(unroll_factor) {} -tensorflow::Status KernelThunk::Initialize(const GpuExecutable& executable) { +Status KernelThunk::Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) { tensorflow::mutex_lock lock(mutex_); - if (loader_spec_) { - // Already initialized by another thread. - return tensorflow::Status::OK(); - } - - loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size())); - tensorflow::StringPiece text = executable.text(); - // Convert tensorflow::StringPiece to se::port::StringPiece because - // StreamExecutor uses the latter. - loader_spec_->AddCudaPtxInMemory( - se::port::StringPiece(text.data(), text.size()), kernel_name_); + if (!loader_spec_) { + loader_spec_.reset(new se::MultiKernelLoaderSpec(args_.size())); + tensorflow::StringPiece text = executable.text(); + // Convert tensorflow::StringPiece to se::port::StringPiece because + // StreamExecutor uses the latter. + loader_spec_->AddCudaPtxInMemory( + se::port::StringPiece(text.data(), text.size()), kernel_name_); // XXX figure out how to cope with both CUDA and ROCm platforms #if GOOGLE_CUDA - if (!executable.cubin().empty()) { - loader_spec_->AddCudaCubinInMemory( - reinterpret_cast(executable.cubin().data()), kernel_name_); - } + if (!executable.cubin().empty()) { + loader_spec_->AddCudaCubinInMemory( + reinterpret_cast(executable.cubin().data()), + kernel_name_); + } #endif + } - return tensorflow::Status::OK(); + // Load the kernel into the device if necessary. + // + // We could alternatively do this within ExecuteOnStream, but doing it here + // lets the time spent loading the kernel not count towards our execution + // profiles. + auto it = kernel_cache_.find(executor); + if (kernel_cache_.end() == it) { + it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first; + if (!executor->GetKernel(*loader_spec_, &it->second)) { + return InternalError("Unable to load kernel %s", kernel_name_.c_str()); + } + } + + return Status::OK(); } void KernelThunk::SetLaunchDimensions(const LaunchDimensions& launch_dims) { @@ -65,21 +77,18 @@ void KernelThunk::SetLaunchDimensions(const LaunchDimensions& launch_dims) { launch_dimensions_ = launch_dims; } -tensorflow::Status KernelThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { +Status KernelThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) { // Load the kernel. 
se::StreamExecutor* executor = stream->parent(); LaunchDimensions launch_dimensions; const se::KernelBase* kernel = nullptr; + { tensorflow::mutex_lock lock(mutex_); auto it = kernel_cache_.find(executor); - if (kernel_cache_.end() == it) { - it = kernel_cache_.emplace(executor, se::KernelBase(executor)).first; - if (!executor->GetKernel(*loader_spec_, &it->second)) { - return InternalError("Unable to load kernel %s", kernel_name_.c_str()); - } - } + CHECK(it != kernel_cache_.end()) + << "Initialize() not called for StreamExecutor " << executor; launch_dimensions = launch_dimensions_; kernel = &it->second; } @@ -100,7 +109,7 @@ tensorflow::Status KernelThunk::ExecuteOnStream( *kernel_args)) { return InternalError("Unable to launch kernel %s", kernel_name_.c_str()); } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h index b556befe66b6be..7def27e189b667 100644 --- a/tensorflow/compiler/xla/service/gpu/kernel_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/kernel_thunk.h @@ -57,11 +57,12 @@ class KernelThunk : public Thunk { int unroll_factor() const { return unroll_factor_; } void SetLaunchDimensions(const LaunchDimensions& launch_dims); - tensorflow::Status Initialize(const GpuExecutable& executable) override; + Status Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) override; // Executes the kernel for the thunk on "stream", which must be non-null. - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; private: // Buffers passed to the kernel as arguments. @@ -83,7 +84,8 @@ class KernelThunk : public Thunk { mutable tensorflow::mutex mutex_; std::unique_ptr loader_spec_ GUARDED_BY(mutex_); - // Loaded kernels for each `StreamExecutor` + // Loaded kernels for each `StreamExecutor`. Requires pointer stability of + // values. 
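The pointer-stability requirement noted above exists because `KernelThunk::ExecuteOnStream` (earlier in this hunk) releases the mutex and then keeps using the kernel pointer it found in the cache that `Initialize` populated. A minimal sketch of that initialize-then-lookup pattern with standard containers (the `Kernel` and `ExecutorId` types are stand-ins for illustration, not the StreamExecutor API):

```c++
// The cache is populated once per executor under a mutex; later lookups hand
// out a pointer into the map. std::unordered_map guarantees that inserting
// new entries never invalidates references to existing values, which is the
// "pointer stability" the comment above relies on.
#include <cassert>
#include <mutex>
#include <string>
#include <unordered_map>

struct Kernel { std::string name; };   // stand-in for a loaded kernel object
using ExecutorId = int;                // stand-in for a StreamExecutor handle

class KernelCache {
 public:
  // Initialize-time path: load the kernel for this executor if not present.
  void Initialize(ExecutorId executor, const std::string& kernel_name) {
    std::lock_guard<std::mutex> lock(mutex_);
    kernels_.emplace(executor, Kernel{kernel_name});
  }

  // Execute-time path: the kernel must already be loaded; we only look it up,
  // and the returned pointer stays valid after the lock is released.
  const Kernel* Lookup(ExecutorId executor) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto it = kernels_.find(executor);
    assert(it != kernels_.end() && "Initialize() not called for this executor");
    return &it->second;
  }

 private:
  std::mutex mutex_;
  std::unordered_map<ExecutorId, Kernel> kernels_;
};

int main() {
  KernelCache cache;
  cache.Initialize(/*executor=*/0, "fusion_kernel");
  const Kernel* k = cache.Lookup(0);
  assert(k->name == "fusion_kernel");
  return 0;
}
```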
std::unordered_map kernel_cache_ GUARDED_BY(mutex_); }; diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD index 0f229a4418a095..5fc8f1a5fe0bda 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/BUILD @@ -58,7 +58,6 @@ cc_library( "@llvm//:scalar", "@llvm//:support", "@llvm//:target", - "@llvm//:transform_utils", ], ) diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/amdgpu_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/amdgpu_backend_lib.cc index d71ab62f8a0a07..dc6857f28a79da 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/amdgpu_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/amdgpu_backend_lib.cc @@ -248,7 +248,7 @@ std::vector EmitModuleToHsaco(Module* module, llvm::TargetMachine* target_ codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( llvm::Triple(module->getTargetTriple()))); - target_machine->addPassesToEmitFile(codegen_passes, pstream, + target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, llvm::TargetMachine::CGFT_ObjectFile); codegen_passes.run(*module); } diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc index df9d9be889ce83..a4e4e85bf3d2c1 100644 --- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc +++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc @@ -77,8 +77,7 @@ static string GetLibdeviceFilename(const string& libdevice_dir_path, // Since CUDA 9.0, all GPU versions are included in a single file const char* unified_libdevice_filename = "libdevice.10.bc"; std::vector unified_libdevice_files; - const tensorflow::Status status = - tensorflow::Env::Default()->GetMatchingPaths( + const Status status = tensorflow::Env::Default()->GetMatchingPaths( tensorflow::io::JoinPath(libdevice_dir_path, unified_libdevice_filename), &unified_libdevice_files); if (status.ok() && unified_libdevice_files.size() == 1) { @@ -273,7 +272,7 @@ string EmitModuleToPTX(Module* module, llvm::TargetMachine* target_machine) { codegen_passes.add(new llvm::TargetLibraryInfoWrapperPass( llvm::Triple(module->getTargetTriple()))); - target_machine->addPassesToEmitFile(codegen_passes, pstream, + target_machine->addPassesToEmitFile(codegen_passes, pstream, nullptr, llvm::TargetMachine::CGFT_AssemblyFile); codegen_passes.run(*module); } @@ -311,11 +310,11 @@ bool CouldNeedLibdevice(const llvm::Module& module) { } // Links libdevice into the given module if the module needs libdevice. 
-tensorflow::Status LinkLibdeviceIfNecessary( - llvm::Module* module, std::pair compute_capability, - const string& libdevice_dir_path) { +Status LinkLibdeviceIfNecessary(llvm::Module* module, + std::pair compute_capability, + const string& libdevice_dir_path) { if (!CouldNeedLibdevice(*module)) { - return tensorflow::Status::OK(); + return Status::OK(); } llvm::Linker linker(*module); @@ -336,7 +335,7 @@ tensorflow::Status LinkLibdeviceIfNecessary( return tensorflow::errors::Internal(tensorflow::strings::StrCat( "Error linking libdevice from ", libdevice_path)); } - return tensorflow::Status::OK(); + return Status::OK(); } StatusOr CompileModuleToPtx(llvm::Module* module, @@ -491,7 +490,7 @@ StatusOr CompileToPtx(llvm::Module* module, string ptx; { - tensorflow::port::Tracing::TraceMe annotation( + tensorflow::tracing::ScopedActivity activity( "Compiling IR", llvm_ir::AsString(module->getName()), /*is_expensive=*/true); XLA_SCOPED_LOGGING_TIMER("Compile module " + diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc index 43ba786443891e..8ecd9ceb5c49f2 100644 --- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc +++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc @@ -73,6 +73,8 @@ limitations under the License. #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/transpose_folding.h" #include "tensorflow/compiler/xla/service/tuple_simplifier.h" +#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" +#include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h" #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" #include "tensorflow/compiler/xla/service/zero_sized_hlo_elimination.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -100,7 +102,7 @@ namespace gpu { namespace { -using tensorflow::port::Tracing; +namespace tracing = tensorflow::tracing; // Returns the directory containing nvvm libdevice files. config_cuda_data_dir // should be equal to config().debug_options().xla_gpu_cuda_data_dir() of the @@ -128,9 +130,8 @@ string GetLibdeviceDir(const string& config_cuda_data_dir) { } // Runs optimization passes on the given HLO module. -tensorflow::Status OptimizeHloModule(HloModule* hlo_module, - se::StreamExecutor* stream_exec, - DeviceMemoryAllocator* device_allocator) { +Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec, + DeviceMemoryAllocator* device_allocator) { { HloPassPipeline pipeline("optimization"); pipeline.AddInvariantChecker(); @@ -158,11 +159,13 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) { pass.AddPass(); } + // TODO(kramerb): Remove use_fusion once instruction fusion can create + // multi-output fusions from the unfused expander output. pass.AddPass( /*rewrite_training_op=*/true, /*rewrite_inference_op=*/true, /*rewrite_grad_op=*/true, - /*use_fusion=*/false); + /*use_fusion=*/true); // Rewrite gather ops into smaller ones. 
pass.AddPass(); @@ -175,6 +178,7 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, /*is_layout_sensitive=*/false, [](const Shape&, const Shape&) { return false; }); pass.AddPass(); + pass.AddPass(); pass.AddPass(); pass.AddPass(); pass.AddPass(); @@ -201,18 +205,28 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, pipeline.AddInvariantChecker(); pipeline.AddPass(); pipeline.AddPass(); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + + { + HloPassPipeline pipeline("layout_assignment"); + pipeline.AddPass( + hlo_module->mutable_device_entry_computation_layout(), stream_exec); + + // The LayoutAssignment pass may leave behind kCopy instructions which are + // duplicate or NOPs, so remove them with algebraic simplification and CSE. + pipeline.AddPass>( + /*is_layout_sensitive=*/true, + /*valid_bitcast_callback=*/[](const Shape&, const Shape&) { + return true; + }); // Choose the fastest algorithm for each conv. // - // In theory doing this here is way too early: It needs to happen after - // layout assignment, because the layout of the inputs/outputs affects the - // speed of the conv. But currently we only allow only one input/output - // layout when calling cudnn, so there's no ambiguity. - // - // We pick the algorithm at this early stage so we can generate better HLO. - // After CudnnConvolutionRewriter, our convolutions are CustomCalls which - // return a tuple (conv_result, scratch_memory), and the each conv uses 0 - // bytes of scratch: + // We pick the algorithm before fusion so we can generate better HLO. After + // CudnnConvolutionRewriter, our convolutions are CustomCalls which return a + // tuple (conv_result, scratch_memory), and the each conv uses 0 bytes of + // scratch: // // customcall = (f32[...], f32[0]) // return gte(customcall, 0) @@ -228,35 +242,15 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, // The new tuple and gte instructions then be simplified away, because // nobody is expected to use the scratch value. // - // However, if we were to run CudnnConvolutionAlgorithmPicker after layout - // assignment, fusion would already have run, and the gte(customcall, 0) - // would probably already be into a fusion node. We can't simplify across - // HloComputation boundaries, so in this case we wouldn't be able to - // simplify away the new_tuple bits. - // - // We'll need to revisit this if we ever allow multiple layouts for the - // inputs/outputs of a cudnn convolution. + // However, if we were to run CudnnConvolutionAlgorithmPicker after fusion + // the gte(customcall, 0) would probably already be into a fusion node. We + // can't simplify across HloComputation boundaries, so in this case we + // wouldn't be able to simplify away the new_tuple bits. pipeline.AddPass(stream_exec, device_allocator); // Clean up new_tuple described above. pipeline.AddPass(); - pipeline.AddPass(); - TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); - } - - { - HloPassPipeline pipeline("layout_assignment"); - pipeline.AddPass( - hlo_module->mutable_entry_computation_layout()); - - // The LayoutAssignment pass may leave behind kCopy instructions which are - // duplicate or NOPs, so remove them with algebraic simplification and CSE. 
- pipeline.AddPass>( - /*is_layout_sensitive=*/true, - /*valid_bitcast_callback=*/[](const Shape&, const Shape&) { - return true; - }); pipeline.AddPass(/*is_layout_sensitive=*/true); TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); } @@ -283,12 +277,21 @@ tensorflow::Status OptimizeHloModule(HloModule* hlo_module, TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status()); } } - return tensorflow::Status::OK(); + + { + // Do an aggressive LICM pass over while loops. In particular, this hoists + // constants that were sunk by WhileLoopConstantSinking. Leaving them in + // the while loop may result in unnecessary copies. + HloPassPipeline pipeline("while-loop-licm"); + pipeline.AddPass(true); + TF_RETURN_IF_ERROR(pipeline.Run(hlo_module).status()); + } + return Status::OK(); } // Modifies the given HLO module so that it will be accepted by IrEmitter. // Unlike optimization passes, the passes are necessary for correctness. -tensorflow::Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) { +Status PrepareHloModuleForIrEmitting(HloModule* hlo_module) { // In some cases, we have to place the result of an instruction in a temporary // buffer. For instance, the buffer that holds an external parameter is // assumed immutable at this point, and should not be reused for output @@ -410,7 +413,7 @@ void WarnIfBadDriverJITVersion() { // code (i.e. a cubin) as a byte array. StatusOr> CompilePtx(const string& ptx, int cc_major, int cc_minor) { - Tracing::TraceMe annotation("Compile PTX", /*is_expensive=*/true); + tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true); const string ptxas_path = tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas"); VLOG(2) << "Using ptxas at " << ptxas_path; @@ -481,8 +484,8 @@ StatusOr> NVPTXCompiler::RunHloPasses( std::unique_ptr module, se::StreamExecutor* stream_exec, DeviceMemoryAllocator* device_allocator) { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::RunHloPasses"); - Tracing::TraceMe annotation("HLO Transforms", module->name(), - /*is_expensive=*/true); + tracing::ScopedActivity activity("HLO Transforms", module->name(), + /*is_expensive=*/true); TF_RETURN_IF_ERROR( OptimizeHloModule(module.get(), stream_exec, device_allocator)); return std::move(module); @@ -692,7 +695,7 @@ std::vector NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx, int cc_major, int cc_minor) { XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult"); - Tracing::TraceMe annotation("PTX->CUBIN", /*is_expensive=*/true); + tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true); bool inserted; decltype(compilation_cache_.begin()) iter; // Pointers into compilation_cache_ where the ptx and (optional) cubin are diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc index 7bda4e2fcd469b..c8f0d4185c63c5 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.cc +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.cc @@ -370,26 +370,38 @@ bool PadInsertion::CanonicalizeBackwardInputConvolution( return true; } -StatusOr PadInsertion::Run(HloModule* module) { +StatusOr PadInsertion::RunOnComputation(HloComputation* computation) { bool changed = false; - for (HloInstruction* instruction : - module->entry_computation()->MakeInstructionPostOrder()) { - if (IsCustomCallToDnnConvolution(*instruction)) { - const auto& target = instruction->custom_call_target(); - if (target == kCudnnConvForwardCallTarget) { - changed |= CanonicalizeForwardConvolution(instruction); 
- } else if (target == kCudnnConvBackwardFilterCallTarget) { - changed |= CanonicalizeBackwardFilterConvolution(instruction); - } else if (target == kCudnnConvBackwardInputCallTarget) { - changed |= CanonicalizeBackwardInputConvolution(instruction); - } else { - LOG(FATAL) << "Unknown custom call target for cudnn conv: " - << instruction->ToString(); - } + std::vector convs; + for (auto* instr : computation->instructions()) { + if (IsCustomCallToDnnConvolution(*instr)) { + convs.push_back(instr); + } + } + for (HloInstruction* instruction : convs) { + const auto& target = instruction->custom_call_target(); + if (target == kCudnnConvForwardCallTarget) { + changed |= CanonicalizeForwardConvolution(instruction); + } else if (target == kCudnnConvBackwardFilterCallTarget) { + changed |= CanonicalizeBackwardFilterConvolution(instruction); + } else if (target == kCudnnConvBackwardInputCallTarget) { + changed |= CanonicalizeBackwardInputConvolution(instruction); + } else { + LOG(FATAL) << "Unknown custom call target for cudnn conv: " + << instruction->ToString(); } } return changed; } +StatusOr PadInsertion::Run(HloModule* module) { + bool changed = false; + for (HloComputation* computation : module->MakeNonfusionComputations()) { + TF_ASSIGN_OR_RETURN(bool result, RunOnComputation(computation)); + changed |= result; + } + return changed; +} + } // namespace gpu } // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/pad_insertion.h b/tensorflow/compiler/xla/service/gpu/pad_insertion.h index 5e1c68701daa02..67e51509e4c717 100644 --- a/tensorflow/compiler/xla/service/gpu/pad_insertion.h +++ b/tensorflow/compiler/xla/service/gpu/pad_insertion.h @@ -31,6 +31,7 @@ class PadInsertion : public HloPassInterface { StatusOr Run(HloModule* module) override; private: + StatusOr RunOnComputation(HloComputation* computation); // Returns if any changes are made to the parent computation. bool CanonicalizeForwardConvolution(HloInstruction* conv); bool CanonicalizeBackwardFilterConvolution(HloInstruction* backward_conv); diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc index c8510808f10a73..88cb10883e97ae 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.cc @@ -20,24 +20,24 @@ limitations under the License. 
namespace xla { namespace gpu { -SequentialThunk::SequentialThunk(std::vector>&& thunks, +SequentialThunk::SequentialThunk(std::vector> thunks, const HloInstruction* hlo) : Thunk(Kind::kSequential, hlo), thunks_(std::move(thunks)) {} -tensorflow::Status SequentialThunk::Initialize( - const GpuExecutable& executable) { +Status SequentialThunk::Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) { for (auto& thunk : thunks_) { - TF_RETURN_IF_ERROR(thunk->Initialize(executable)); + TF_RETURN_IF_ERROR(thunk->Initialize(executable, executor)); } - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status SequentialThunk::ExecuteOnStream( +Status SequentialThunk::ExecuteOnStream( const BufferAllocations& buffer_allocations, se::Stream* stream) { for (const auto& thunk : thunks_) { TF_RETURN_IF_ERROR(thunk->ExecuteOnStream(buffer_allocations, stream)); } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h index df17b8d67b8032..135f79e413dfaa 100644 --- a/tensorflow/compiler/xla/service/gpu/sequential_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/sequential_thunk.h @@ -31,16 +31,17 @@ namespace gpu { // require multiple kernel launches or library calls. class SequentialThunk : public Thunk { public: - SequentialThunk(std::vector>&& thunks, + SequentialThunk(std::vector> thunks, const HloInstruction* hlo); SequentialThunk(const SequentialThunk&) = delete; SequentialThunk& operator=(const SequentialThunk&) = delete; const std::vector>& thunks() const { return thunks_; } - tensorflow::Status Initialize(const GpuExecutable& executable) override; - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; private: // The list of sub-thunks. diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc index 8c98956f1a9b2a..696fa7e0194032 100644 --- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc +++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc @@ -28,6 +28,15 @@ namespace gpu { class StreamAssignmentTest : public HloTestBase { protected: + std::unique_ptr CreateNewModule() { + HloModuleConfig config; + auto debug_options = GetDebugOptionsForTest(); + debug_options.set_xla_gpu_disable_multi_streaming(false); + config.set_debug_options(debug_options); + return MakeUnique("test_module", VersionedComputationHandle(), + config); + } + // Pre-canned shapes. 
Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2}); }; @@ -41,9 +50,9 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) { HloInstruction* z = builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/2, f32_2x2_, /*name=*/"z")); HloInstruction* dot1 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y)); + HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); HloInstruction* dot2 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, dot1, z)); + HloInstruction::CreateCanonicalDot(f32_2x2_, dot1, z)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build(dot2)); @@ -60,9 +69,9 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) { HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( /*parameter_number=*/1, f32_2x2_, /*name=*/"y")); HloInstruction* dot1 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, x, y)); + HloInstruction::CreateCanonicalDot(f32_2x2_, x, y)); HloInstruction* dot2 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, y, x)); + HloInstruction::CreateCanonicalDot(f32_2x2_, y, x)); HloInstruction* add = builder.AddInstruction( HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2)); @@ -91,24 +100,24 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) { params.push_back(builder.AddInstruction(HloInstruction::CreateParameter( i, f32_2x2_, /*name=*/tensorflow::strings::Printf("param%d", i)))); } - HloInstruction* d00 = builder.AddInstruction(HloInstruction::CreateBinary( - f32_2x2_, HloOpcode::kDot, params[2], params[3])); + HloInstruction* d00 = builder.AddInstruction( + HloInstruction::CreateCanonicalDot(f32_2x2_, params[2], params[3])); HloInstruction* d10 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[1], d00)); + HloInstruction::CreateCanonicalDot(f32_2x2_, params[1], d00)); HloInstruction* d11 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d00, params[4])); + HloInstruction::CreateCanonicalDot(f32_2x2_, d00, params[4])); HloInstruction* d20 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, params[0], d10)); + HloInstruction::CreateCanonicalDot(f32_2x2_, params[0], d10)); HloInstruction* d21 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d10, d11)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d10, d11)); HloInstruction* d22 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d11, params[5])); + HloInstruction::CreateCanonicalDot(f32_2x2_, d11, params[5])); HloInstruction* d30 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d20, d21)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d20, d21)); HloInstruction* d31 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d21, d22)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d21, d22)); HloInstruction* d40 = builder.AddInstruction( - HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kDot, d30, d31)); + HloInstruction::CreateCanonicalDot(f32_2x2_, d30, d31)); auto module = CreateNewModule(); module->AddEntryComputation(builder.Build(d40)); diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc new file mode 100644 index 00000000000000..a50ddf6ac63c7f --- /dev/null +++ 
b/tensorflow/compiler/xla/service/gpu/stream_executor_util.cc @@ -0,0 +1,151 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/gpu/stream_executor_util.h" + +#include "tensorflow/compiler/xla/layout_util.h" + +namespace xla { +namespace gpu { + +using stream_executor::dnn::DataLayout; +using stream_executor::dnn::DataLayoutString; +using stream_executor::dnn::FilterLayout; +using stream_executor::dnn::FilterLayoutString; + +StatusOr> +StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, + DataLayout input, FilterLayout filter, + DataLayout output) { + std::vector input_layout; + switch (input) { + case DataLayout::kBatchDepthYX: + input_layout.push_back(dnums.input_batch_dimension()); + input_layout.push_back(dnums.input_feature_dimension()); + input_layout.insert(input_layout.end(), + dnums.input_spatial_dimensions().begin(), + dnums.input_spatial_dimensions().end()); + break; + case DataLayout::kBatchYXDepth: + input_layout.push_back(dnums.input_batch_dimension()); + input_layout.insert(input_layout.end(), + dnums.input_spatial_dimensions().begin(), + dnums.input_spatial_dimensions().end()); + input_layout.push_back(dnums.input_feature_dimension()); + break; + default: + return tensorflow::errors::Internal("Invalid input layout: ", + DataLayoutString(input)); + } + + std::vector filter_layout; + switch (filter) { + case FilterLayout::kOutputInputYX: + filter_layout.push_back(dnums.kernel_output_feature_dimension()); + filter_layout.push_back(dnums.kernel_input_feature_dimension()); + filter_layout.insert(filter_layout.end(), + dnums.kernel_spatial_dimensions().begin(), + dnums.kernel_spatial_dimensions().end()); + break; + case FilterLayout::kOutputYXInput: + filter_layout.push_back(dnums.kernel_output_feature_dimension()); + filter_layout.insert(filter_layout.end(), + dnums.kernel_spatial_dimensions().begin(), + dnums.kernel_spatial_dimensions().end()); + filter_layout.push_back(dnums.kernel_input_feature_dimension()); + break; + default: + return tensorflow::errors::Internal("Invalid filter layout: ", + FilterLayoutString(filter)); + } + + std::vector output_layout; + switch (output) { + case DataLayout::kBatchDepthYX: + output_layout.push_back(dnums.output_batch_dimension()); + output_layout.push_back(dnums.output_feature_dimension()); + output_layout.insert(output_layout.end(), + dnums.output_spatial_dimensions().begin(), + dnums.output_spatial_dimensions().end()); + break; + case DataLayout::kBatchYXDepth: + output_layout.push_back(dnums.output_batch_dimension()); + output_layout.insert(output_layout.end(), + dnums.output_spatial_dimensions().begin(), + dnums.output_spatial_dimensions().end()); + output_layout.push_back(dnums.output_feature_dimension()); + break; + default: + return tensorflow::errors::Internal("Invalid output layout: ", + DataLayoutString(output)); + } + + return 
std::make_tuple(LayoutUtil::MakeLayoutFromMajorToMinor(input_layout), + LayoutUtil::MakeLayoutFromMajorToMinor(filter_layout), + LayoutUtil::MakeLayoutFromMajorToMinor(output_layout)); +} + +StatusOr> +XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums, + const Layout& input, const Layout& filter, + const Layout& output) { + Layout nchw_input, nchw_filter, nchw_output; + std::tie(nchw_input, nchw_filter, nchw_output) = + StreamExecutorConvLayoutsToXlaLayouts(dnums, DataLayout::kBatchDepthYX, + FilterLayout::kOutputInputYX, + DataLayout::kBatchDepthYX) + .ConsumeValueOrDie(); + + Layout nhwc_input, nhwc_filter, nhwc_output; + std::tie(nhwc_input, nhwc_filter, nhwc_output) = + StreamExecutorConvLayoutsToXlaLayouts(dnums, DataLayout::kBatchYXDepth, + FilterLayout::kOutputYXInput, + DataLayout::kBatchYXDepth) + .ConsumeValueOrDie(); + + DataLayout input_layout; + if (LayoutUtil::Equal(input, nchw_input)) { + input_layout = DataLayout::kBatchDepthYX; + } else if (LayoutUtil::Equal(input, nhwc_input)) { + input_layout = DataLayout::kBatchYXDepth; + } else { + return tensorflow::errors::Internal("Invalid input layout: ", + input.ShortDebugString()); + } + + FilterLayout filter_layout; + if (LayoutUtil::Equal(filter, nchw_filter)) { + filter_layout = FilterLayout::kOutputInputYX; + } else if (LayoutUtil::Equal(filter, nhwc_filter)) { + filter_layout = FilterLayout::kOutputYXInput; + } else { + return tensorflow::errors::Internal("Invalid filter layout: ", + filter.ShortDebugString()); + } + + DataLayout output_layout; + if (LayoutUtil::Equal(output, nchw_output)) { + output_layout = DataLayout::kBatchDepthYX; + } else if (LayoutUtil::Equal(output, nhwc_output)) { + output_layout = DataLayout::kBatchYXDepth; + } else { + return tensorflow::errors::Internal("Invalid output layout: ", + output.ShortDebugString()); + } + + return std::make_tuple(input_layout, filter_layout, output_layout); +} +} // namespace gpu +} // namespace xla diff --git a/tensorflow/compiler/xla/service/gpu/stream_executor_util.h b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h new file mode 100644 index 00000000000000..8218f4fd11d397 --- /dev/null +++ b/tensorflow/compiler/xla/service/gpu/stream_executor_util.h @@ -0,0 +1,46 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_ + +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +// Helper functions for interacting with StreamExecutor. + +namespace xla { +namespace gpu { + +// Returns (input, filter, output) XLA Layout protos given the StreamExecutor +// layouts. 
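The two helpers in `stream_executor_util` translate between cuDNN's `DataLayout`/`FilterLayout` enums and XLA `Layout` protos, in both directions. A hedged usage sketch; `dnums` is assumed to be the convolution's `ConvolutionDimensionNumbers`, and the function name is illustrative:

```c++
// Sketch only: derive XLA layouts for an NCHW convolution and round-trip them.
using stream_executor::dnn::DataLayout;
using stream_executor::dnn::FilterLayout;

Status CheckNchwRoundTrip(const ConvolutionDimensionNumbers& dnums) {
  TF_ASSIGN_OR_RETURN(auto xla_layouts,
                      StreamExecutorConvLayoutsToXlaLayouts(
                          dnums, DataLayout::kBatchDepthYX,
                          FilterLayout::kOutputInputYX,
                          DataLayout::kBatchDepthYX));
  const Layout& input = std::get<0>(xla_layouts);
  const Layout& filter = std::get<1>(xla_layouts);
  const Layout& output = std::get<2>(xla_layouts);
  // The reverse mapping fails with an Internal error if the layouts match
  // neither the NCHW nor the NHWC convention.
  return XlaConvLayoutsToStreamExecutorLayouts(dnums, input, filter, output)
      .status();
}
```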
+StatusOr> +StreamExecutorConvLayoutsToXlaLayouts(const ConvolutionDimensionNumbers& dnums, + stream_executor::dnn::DataLayout input, + stream_executor::dnn::FilterLayout filter, + stream_executor::dnn::DataLayout output); + +// Returns (input, filter, output) StreamExecutor layouts given the XLA layouts. +StatusOr> +XlaConvLayoutsToStreamExecutorLayouts(const ConvolutionDimensionNumbers& dnums, + const Layout& input, const Layout& filter, + const Layout& output); + +} // namespace gpu +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_STREAM_EXECUTOR_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/gpu/thunk.h b/tensorflow/compiler/xla/service/gpu/thunk.h index a0c785ed913109..931c0bffab8503 100644 --- a/tensorflow/compiler/xla/service/gpu/thunk.h +++ b/tensorflow/compiler/xla/service/gpu/thunk.h @@ -70,11 +70,14 @@ class Thunk { Kind kind() const { return kind_; } const HloInstruction* hlo_instruction() const { return hlo_instruction_; } - // Prepares for executing the thunk. This method is called only once over - // Thunk's lifetime. For example, KernelThunk::Initialize loads the PTX of a - // kernel, which is the same in every execution. - virtual tensorflow::Status Initialize(const GpuExecutable& executable) { - return tensorflow::Status::OK(); + // Prepares the thunk for execution on the given StreamExecutor. + // + // This may be called multiple times. Its main purpose is to give us a chance + // to do initialization outside of ExecuteOnStream() so that the + // time spent initializing doesn't count towards our execution profile. + virtual Status Initialize(const GpuExecutable& /*executable*/, + se::StreamExecutor* /*executor*/) { + return Status::OK(); } // Users of Thunk should call ShouldHaltAllActivityBeforeRunning(stream) @@ -89,21 +92,13 @@ class Thunk { return false; } - // Indicates whether thunks scheduled after this one should wait for this one - // to complete before running. For example, a convolution thunk creates a - // scratch allocator, then kicks off a convolution in cudnn via the stream - // executor. When the stream executor call returns, the scratch allocator goes - // out of scope, and the scratch memory is deallocated. In this case, the - // convolution thunk needs to return true so that future thunks wait for the - // convolution thunk to avoid reusing the deallocated memory until the - // convolution thunk is done with it. - virtual bool ShouldBlockFutureThunks() { return false; } - // Execute the kernel for the thunk on the given stream. This method must be // called after Initialize and can be called multiple times over Thunk's // lifetime. Stream argument must be non-null. - virtual tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) = 0; + // + // Precondition: Initialize(stream->parent()) has been called. + virtual Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) = 0; private: Kind kind_; diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc index ecb54857ccc40e..97cb04c38fbf18 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.cc @@ -20,8 +20,8 @@ limitations under the License. 
namespace xla { namespace gpu { -tensorflow::Status TupleThunk::ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) { +Status TupleThunk::ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) { std::vector tuple_element_buffer_addresses; for (BufferAllocation::Slice tuple_element_buffer : tuple_element_buffers_) { tuple_element_buffer_addresses.push_back( @@ -40,7 +40,7 @@ tensorflow::Status TupleThunk::ExecuteOnStream( tuple_element_buffer_addresses.data(), dest_buffer_address.opaque(), sizeof(void*) * tuple_element_buffer_addresses.size()); } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace gpu diff --git a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h index 8b459c29a136a6..951f809b51937c 100644 --- a/tensorflow/compiler/xla/service/gpu/tuple_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/tuple_thunk.h @@ -45,8 +45,8 @@ class TupleThunk : public Thunk { TupleThunk(const TupleThunk&) = delete; TupleThunk& operator=(const TupleThunk&) = delete; - tensorflow::Status ExecuteOnStream( - const BufferAllocations& buffer_allocations, se::Stream* stream) override; + Status ExecuteOnStream(const BufferAllocations& buffer_allocations, + se::Stream* stream) override; private: const std::vector tuple_element_buffers_; diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.cc b/tensorflow/compiler/xla/service/gpu/while_thunk.cc index a9f3d619a3ffd6..30b9640c4c75da 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.cc +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.cc @@ -34,9 +34,11 @@ WhileThunk::WhileThunk( body_thunk_sequence_( MakeUnique(std::move(*body_thunk_sequence), hlo)) {} -Status WhileThunk::Initialize(const GpuExecutable& executable) { - TF_RETURN_IF_ERROR(condition_thunk_sequence_->Initialize(executable)); - TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable)); +Status WhileThunk::Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) { + TF_RETURN_IF_ERROR( + condition_thunk_sequence_->Initialize(executable, executor)); + TF_RETURN_IF_ERROR(body_thunk_sequence_->Initialize(executable, executor)); return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/gpu/while_thunk.h b/tensorflow/compiler/xla/service/gpu/while_thunk.h index e589ca78a7ea00..22176685a92df9 100644 --- a/tensorflow/compiler/xla/service/gpu/while_thunk.h +++ b/tensorflow/compiler/xla/service/gpu/while_thunk.h @@ -45,7 +45,8 @@ class WhileThunk : public Thunk { WhileThunk(const WhileThunk&) = delete; WhileThunk& operator=(const WhileThunk&) = delete; - Status Initialize(const GpuExecutable& executable) override; + Status Initialize(const GpuExecutable& executable, + se::StreamExecutor* executor) override; Status ExecuteOnStream(const BufferAllocations& buffer_allocations, se::Stream* stream) override; diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc index e6caec8625f0d6..7749201cbceece 100644 --- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc +++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc @@ -144,7 +144,7 @@ class ExprTree { TF_RETURN_IF_ERROR(pair.second->Match(instruction->operand(pair.first), tagged_instructions)); } - return tensorflow::Status::OK(); + return Status::OK(); } private: @@ -169,7 +169,7 @@ class MatcherBase { // Attempts to match each ExprTree in 'expr_trees_'. 
// Returns OK on the first successful match, error status otherwise. - virtual tensorflow::Status Run() { + virtual Status Run() { Status status; for (const ExprTree& expr_tree : expr_trees_) { status = MatchExprTree(expr_tree); @@ -201,7 +201,7 @@ class MatcherBase { } else if (type == S64) { *const_value = literal.GetFirstElement(); } - return tensorflow::Status::OK(); + return Status::OK(); } StatusOr GetTaggedInstruction( @@ -315,7 +315,7 @@ class WhileConditionComputationMatcher : public MatcherBase { gte_fusion_param0->name().c_str()); } - return tensorflow::Status::OK(); + return Status::OK(); } const HloComputation* computation_; @@ -379,7 +379,7 @@ class WhileInitOperandMatcher : public MatcherBase { GetTaggedInstruction("loop_start", tagged_instructions)); TF_RETURN_IF_ERROR(ParseConstInteger(const_hlo, &loop_start_)); - return tensorflow::Status::OK(); + return Status::OK(); } const HloInstruction* while_hlo_; @@ -457,8 +457,8 @@ class WhileBodyComputationMatcher : public MatcherBase { return InvalidArgument("Unexpected tuple index instruction : %s", inst->name().c_str()); } else if (tag == "loop_increment") { - // Parse the constant which represents the loop induction variable - // increment value. + // ParseHloString the constant which represents the loop induction + // variable increment value. TF_RETURN_IF_ERROR(ParseConstInteger(inst, &loop_increment_)); } else if (tag == "param0" && inst != computation_->parameter_instruction(0)) { @@ -477,7 +477,7 @@ class WhileBodyComputationMatcher : public MatcherBase { } } } - return tensorflow::Status::OK(); + return Status::OK(); } const HloComputation* computation_; diff --git a/tensorflow/compiler/xla/service/graphviz_example.cc b/tensorflow/compiler/xla/service/graphviz_example.cc index 05017008e2ddbe..acf661148699da 100644 --- a/tensorflow/compiler/xla/service/graphviz_example.cc +++ b/tensorflow/compiler/xla/service/graphviz_example.cc @@ -82,7 +82,8 @@ HloComputation* CallForwardingComputation(HloComputation* computation, // instructions. Sets the computation as the entry to an HLO module and returns // the module. std::unique_ptr MakeBigGraph() { - auto module = MakeUnique("BigGraph"); + HloModuleConfig config; + auto module = MakeUnique("BigGraph", config); auto builder = HloComputation::Builder("TestBigGraphvizGraph"); diff --git a/tensorflow/compiler/xla/service/heap_simulator.cc b/tensorflow/compiler/xla/service/heap_simulator.cc index 3dd4c4a0794e5c..06a5e0351b6327 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.cc +++ b/tensorflow/compiler/xla/service/heap_simulator.cc @@ -19,7 +19,6 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/map_util.h" -#include "tensorflow/compiler/xla/service/liveness_util.h" #include "tensorflow/compiler/xla/util.h" namespace xla { @@ -32,7 +31,7 @@ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloModule& module, const SequentialHloOrdering::HloModuleSequence& module_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, const Options& options) { + const BufferValue::SizeFunction& size_fn, const Options& options) { HeapSimulator heap(std::move(algorithm), size_fn, options, &module_sequence); const HloComputation* entry_computation = module.entry_computation(); const std::vector& instruction_sequence = @@ -47,7 +46,7 @@ StatusOr HeapSimulator::Run( std::unique_ptr algorithm, const HloComputation& computation, const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, const Options& options) { + const BufferValue::SizeFunction& size_fn, const Options& options) { HeapSimulator heap(std::move(algorithm), size_fn, options, /*module_sequence=*/nullptr); TF_RETURN_IF_ERROR(heap.RunComputation(computation, instruction_sequence, @@ -73,11 +72,11 @@ Status HeapSimulator::RunComputation( // 'used_buffers' is the reverse map - it tracks which buffers were used by an // instruction, so that we can remove the instructions from a buffer's live // set after they are visited. - FlatMap> live_buffers; - FlatMap> used_buffers; + FlatMap> live_buffers; + FlatMap> used_buffers; auto add_user_to_buffer = [this, &live_buffers, &used_buffers]( const HloInstruction* user, - const LogicalBuffer* buffer) { + const BufferValue* buffer) { if (!IgnoreBuffer(buffer)) { VLOG(4) << " Adding user " << user->name() << " to buffer " << buffer->ToString(); @@ -96,7 +95,7 @@ Status HeapSimulator::RunComputation( const PointsToSet::BufferSet& buffer_set = points_to.CreateFlattenedSet(); for (const HloInstruction* user : instruction->users()) { if (user->opcode() != HloOpcode::kGetTupleElement) { - for (const LogicalBuffer* buffer : buffer_set) { + for (const BufferValue* buffer : buffer_set) { add_user_to_buffer(user, buffer); } } else { @@ -104,12 +103,12 @@ Status HeapSimulator::RunComputation( // alive. It only needs the buffers that relate to the element its // extracting, and the tuple it's extracting from, but not the buffers // for the other elements. 
- for (const LogicalBuffer* buffer : points_to.element({})) { + for (const BufferValue* buffer : points_to.element({})) { add_user_to_buffer(user, buffer); } const PointsToSet& gte_points_to = points_to_analysis.GetPointsToSet(user); - for (const LogicalBuffer* buffer : gte_points_to.CreateFlattenedSet()) { + for (const BufferValue* buffer : gte_points_to.CreateFlattenedSet()) { add_user_to_buffer(user, buffer); } } @@ -117,24 +116,25 @@ Status HeapSimulator::RunComputation( } const HloInstruction* root = computation.root_instruction(); - auto output_source_buffers = - points_to_analysis.GetPointsToSet(root).CreateFlattenedSet(); + BufferValueCompactPointerSet output_source_buffers = + ToBufferValueCompactPointerSet( + points_to_analysis.GetPointsToSet(root).CreateFlattenedSet()); - std::vector dead_buffers_to_free; - std::vector operand_buffers_to_free; + std::vector dead_buffers_to_free; + std::vector operand_buffers_to_free; for (const HloInstruction* instruction : instruction_sequence) { const TuplePointsToAnalysis::BufferDefinitionVector& buffers_defined_by_instruction = points_to_analysis.GetBuffersDefinedByInstruction(instruction); VLOG(3) << "Instruction: " << instruction->ToString(); - for (const LogicalBuffer* buffer : buffers_defined_by_instruction) { + for (const BufferValue* buffer : buffers_defined_by_instruction) { VLOG(4) << " Defines: " << buffer->ToString() << (IgnoreBuffer(buffer) ? " (Ignored)" : ""); } dead_buffers_to_free.clear(); - for (const LogicalBuffer* buffer : buffers_defined_by_instruction) { + for (const BufferValue* buffer : buffers_defined_by_instruction) { if (IgnoreBuffer(buffer)) { continue; } @@ -161,7 +161,7 @@ Status HeapSimulator::RunComputation( // have no instructions left to visit are moved from live_buffers to // operand_buffers_to_free. operand_buffers_to_free.clear(); - for (const LogicalBuffer* operand_buffer : used_buffers[instruction]) { + for (const BufferValue* operand_buffer : used_buffers[instruction]) { if (IgnoreBuffer(operand_buffer)) { continue; } @@ -177,7 +177,7 @@ Status HeapSimulator::RunComputation( } // Sort to get a deterministic iteration order. std::sort(operand_buffers_to_free.begin(), operand_buffers_to_free.end(), - [](const LogicalBuffer* x, const LogicalBuffer* y) { + [](const BufferValue* x, const BufferValue* y) { return x->id() < y->id(); }); @@ -188,7 +188,7 @@ Status HeapSimulator::RunComputation( // // INVARIANT: Either Alloc or ShareBuffer will be called for each buffer // that we should assign. - for (const LogicalBuffer* buffer : buffers_defined_by_instruction) { + for (const BufferValue* buffer : buffers_defined_by_instruction) { if (IgnoreBuffer(buffer)) { continue; } @@ -199,12 +199,12 @@ Status HeapSimulator::RunComputation( // we must be the last user of the buffer. 
bool shared = false; if (options_.may_reuse_operand_buffers) { - for (const LogicalBuffer* operand_buffer : operand_buffers_to_free) { + for (const BufferValue* operand_buffer : operand_buffers_to_free) { if (buffer->instruction()->IsUserOf(operand_buffer->instruction()) && buffer->instruction()->opcode() != HloOpcode::kCopy && - CanShareOperandBufferWithUser( + points_to_analysis.CanShareOperandBufferWithUser( operand_buffer->instruction(), operand_buffer->index(), - buffer->instruction(), buffer->index(), points_to_analysis)) { + buffer->instruction(), buffer->index())) { VLOG(3) << " Sharing: " << buffer->ToString() << " with " << operand_buffer->ToString(); ShareBuffer(buffer, operand_buffer, instruction); @@ -248,11 +248,11 @@ Status HeapSimulator::RunComputation( // Free buffers that are no longer live. This is the earliest point that we // can de-allocate; right after the last use of the buffer. - for (const LogicalBuffer* buffer : dead_buffers_to_free) { + for (const BufferValue* buffer : dead_buffers_to_free) { VLOG(3) << " Freeing dead: " << buffer->ToString(); Free(buffer, instruction); } - for (const LogicalBuffer* buffer : operand_buffers_to_free) { + for (const BufferValue* buffer : operand_buffers_to_free) { VLOG(3) << " Freeing operand: " << buffer->ToString(); Free(buffer, instruction); } @@ -261,10 +261,10 @@ Status HeapSimulator::RunComputation( // Any remaining live buffers must be entry parameters or output source // buffers, which had a nullptr sentry added. Free them now, in a // deterministic order. - std::vector to_free; + std::vector to_free; to_free.reserve(live_buffers.size()); for (const auto& buffer_pending : live_buffers) { - const LogicalBuffer* buffer = buffer_pending.first; + const BufferValue* buffer = buffer_pending.first; const FlatSet& pending = buffer_pending.second; CHECK_EQ(pending.size(), 1) << *buffer; CHECK(*pending.begin() == nullptr) << *buffer; @@ -272,10 +272,10 @@ Status HeapSimulator::RunComputation( } std::sort(to_free.begin(), to_free.end(), - [](const LogicalBuffer* x, const LogicalBuffer* y) { + [](const BufferValue* x, const BufferValue* y) { return x->id() < y->id(); }); - for (const LogicalBuffer* buffer : to_free) { + for (const BufferValue* buffer : to_free) { VLOG(3) << "Freeing pending: " << buffer->ToString(); Free(buffer, root); } @@ -285,7 +285,7 @@ Status HeapSimulator::RunComputation( HeapSimulator::HeapSimulator( std::unique_ptr algorithm, - const LogicalBuffer::SizeFunction& size_fn, const Options& options, + const BufferValue::SizeFunction& size_fn, const Options& options, const SequentialHloOrdering::HloModuleSequence* module_sequence) : no_fragmentation_stats_(MakeUnique()), algorithm_(std::move(algorithm)), @@ -297,7 +297,7 @@ HeapSimulator::HeapSimulator( HeapSimulator::~HeapSimulator() {} -bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const { +bool HeapSimulator::IgnoreBuffer(const BufferValue* buffer) const { // Buffers for constants are ignored unless the alloc_constants option is // set. Also ignore buffers that we're not meant to assign. // @@ -311,7 +311,7 @@ bool HeapSimulator::IgnoreBuffer(const LogicalBuffer* buffer) const { } // Alloc always calls the underlying heap algorithm. 
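The sharing check above is the behavioral change in this hunk: the free function `CanShareOperandBufferWithUser` is replaced by a query on `TuplePointsToAnalysis`. Distilled into a standalone predicate for clarity (a sketch, not code from the patch):

```c++
// A newly defined buffer may reuse an operand's buffer only if its defining
// instruction actually uses that operand, is not a kCopy, and the points-to
// analysis allows the user to alias the operand's buffer. The caller must
// still ensure this is the operand buffer's last use.
bool MayShare(const TuplePointsToAnalysis& points_to_analysis,
              const BufferValue& defined, const BufferValue& operand) {
  return defined.instruction()->IsUserOf(operand.instruction()) &&
         defined.instruction()->opcode() != HloOpcode::kCopy &&
         points_to_analysis.CanShareOperandBufferWithUser(
             operand.instruction(), operand.index(), defined.instruction(),
             defined.index());
}
```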
-void HeapSimulator::Alloc(const LogicalBuffer* buffer, +void HeapSimulator::Alloc(const BufferValue* buffer, const HloInstruction* instruction) { CHECK(allocated_buffers_.count(buffer) == 0) << "Alloc called on allocated buffer: " << *buffer; @@ -331,7 +331,7 @@ void HeapSimulator::Alloc(const LogicalBuffer* buffer, // buffers whose group liveness has expired. Shared group liveness is tracked // by maintaining a refcount; the Free call on the last buffer in the group // causes Free to be called on the underlying algorithm. -void HeapSimulator::Free(const LogicalBuffer* buffer, +void HeapSimulator::Free(const BufferValue* buffer, const HloInstruction* instruction) { auto shared_it = shared_buffers_.find(buffer); if (shared_it != shared_buffers_.end()) { @@ -362,8 +362,8 @@ void HeapSimulator::Free(const LogicalBuffer* buffer, // The 'buffer' must be a non-allocated, non-freed buffer, just like in calls to // Alloc. The 'shared' buffer must be a previously allocated or shared buffer. // Both 'buffer' and 'shared' will be associated with the same SharedGroup. -void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer, - const LogicalBuffer* shared, +void HeapSimulator::ShareBuffer(const BufferValue* buffer, + const BufferValue* shared, const HloInstruction* instruction) { CHECK_LE(size_fn_(*buffer), size_fn_(*shared)) << "ShareBuffer oversized buffer" << *buffer << " shared: " << *shared; @@ -374,7 +374,7 @@ void HeapSimulator::ShareBuffer(const LogicalBuffer* buffer, CHECK(freed_buffers_.count(shared) == 0) << "ShareBuffer called on freed shared buffer: " << *shared; - const LogicalBuffer* canonical = nullptr; + const BufferValue* canonical = nullptr; auto shared_it = shared_buffers_.find(shared); if (shared_it != shared_buffers_.end()) { // The 'shared' buffer already has a group; it might be the canonical, but @@ -408,7 +408,7 @@ HeapSimulator::Result HeapSimulator::Finish() { // collecting statistics, e.g. NoFragmentationStatsHeap. 
if (!result.chunk_map.empty()) { for (const auto& share_pair : shared_buffers_) { - const LogicalBuffer* buffer = share_pair.first; + const BufferValue* buffer = share_pair.first; std::shared_ptr group = share_pair.second; if (buffer != group->canonical) { // The canonical must already exist in the chunk_map, since we called @@ -437,9 +437,9 @@ HeapSimulator::Result HeapSimulator::Finish() { } void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, - const LogicalBuffer* buffer, + const BufferValue* buffer, const HloInstruction* instruction, - const LogicalBuffer* share_with_canonical) { + const BufferValue* share_with_canonical) { HeapSimulatorTrace::Event* event = debug_trace_.add_events(); event->set_kind(kind); event->set_buffer_id(buffer->id()); @@ -453,14 +453,14 @@ void HeapSimulator::FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, } } -void NoFragmentationStatsHeap::Alloc(const LogicalBuffer* buffer, int64 size) { +void NoFragmentationStatsHeap::Alloc(const BufferValue* buffer, int64 size) { current_heap_size_ += size; if (current_heap_size_ > max_heap_size_) { max_heap_size_ = current_heap_size_; } } -void NoFragmentationStatsHeap::Free(const LogicalBuffer* buffer, int64 size) { +void NoFragmentationStatsHeap::Free(const BufferValue* buffer, int64 size) { current_heap_size_ -= size; } @@ -472,12 +472,12 @@ HeapSimulator::Result NoFragmentationStatsHeap::Finish() { return result; } -void DecreasingSizeRunsHeap::Alloc(const LogicalBuffer* buffer, int64 size) { +void DecreasingSizeRunsHeap::Alloc(const BufferValue* buffer, int64 size) { SetMode(kAlloc); run_.emplace_back(Op{buffer, size}); } -void DecreasingSizeRunsHeap::Free(const LogicalBuffer* buffer, int64 size) { +void DecreasingSizeRunsHeap::Free(const BufferValue* buffer, int64 size) { CHECK(mode_ != kInit) << "Free called on empty heap: " << *buffer; SetMode(kFree); run_.emplace_back(Op{buffer, size}); @@ -518,7 +518,7 @@ void DecreasingSizeRunsHeap::CallAndDrainRun() { run_.clear(); } -void LazyBestFitHeap::Alloc(const LogicalBuffer* buffer, int64 size) { +void LazyBestFitHeap::Alloc(const BufferValue* buffer, int64 size) { // Degenerate case: 0-sized buffers are always allocated at offset 0. if (size == 0) { result_.chunk_map.emplace(buffer, Chunk{0, 0}); @@ -586,7 +586,7 @@ void LazyBestFitHeap::Alloc(const LogicalBuffer* buffer, int64 size) { result_.chunk_map.emplace(buffer, Chunk{kLazyAllocOffset, size}); } -void LazyBestFitHeap::Free(const LogicalBuffer* buffer, int64 size) { +void LazyBestFitHeap::Free(const BufferValue* buffer, int64 size) { auto alloc_it = result_.chunk_map.find(buffer); CHECK(alloc_it != result_.chunk_map.end()) << "Free called on non-allocated buffer: " << *buffer; diff --git a/tensorflow/compiler/xla/service/heap_simulator.h b/tensorflow/compiler/xla/service/heap_simulator.h index 636f19dd39f097..8b2b43a37a5c41 100644 --- a/tensorflow/compiler/xla/service/heap_simulator.h +++ b/tensorflow/compiler/xla/service/heap_simulator.h @@ -21,11 +21,12 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/service/buffer_value.h" +#include "tensorflow/compiler/xla/service/buffer_value_containers.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" -#include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/gtl/flatmap.h" @@ -43,7 +44,7 @@ class HeapAlgorithm; // don't need to return the assignment of buffer offsets until the very end. class HeapSimulator { public: - // Chunk represents a contiguous piece of memory. Each LogicalBuffer will be + // Chunk represents a contiguous piece of memory. Each BufferValue will be // associated with a chunk in the assignment result. struct Chunk { int64 offset; @@ -55,7 +56,7 @@ class HeapSimulator { // Result represents the result of the heap simulation. struct Result { // The assignment of buffers to chunks. - tensorflow::gtl::FlatMap chunk_map; + tensorflow::gtl::FlatMap chunk_map; // The total size in bytes of the heap, containing all assigned chunks. int64 heap_size = 0; @@ -81,7 +82,7 @@ class HeapSimulator { bool alloc_constants; // If 'buffers_to_assign' is provided, only those buffers are assigned // offsets, otherwise all buffers defined by the instructions are assigned. - const tensorflow::gtl::FlatSet* buffers_to_assign; + const BufferValueFlatSet* buffers_to_assign; }; // Run the heap simulation with the given algorithm, assuming the given @@ -97,7 +98,7 @@ class HeapSimulator { std::unique_ptr algorithm, const HloModule& module, const SequentialHloOrdering::HloModuleSequence& module_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, + const BufferValue::SizeFunction& size_fn, const Options& options = Options()); // Same as above, but runs on a single computation. The 'instruction_sequence' @@ -109,7 +110,7 @@ class HeapSimulator { const HloComputation& computation, const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_fn, + const BufferValue::SizeFunction& size_fn, const Options& options = Options()); private: @@ -118,7 +119,7 @@ class HeapSimulator { // be run recursively. I.e. the simulation is run over the whole module. 
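With the public API now phrased in terms of `BufferValue` rather than `LogicalBuffer`, callers mostly just swap the type in their size functions. A minimal sketch of running the simulator over a whole module, assuming `module_sequence` and `points_to_analysis` were produced by earlier passes and that 64-byte alignment and 8-byte pointers are placeholder values:

```c++
// Sketch only: report the simulated peak heap size for a module.
StatusOr<int64> SimulatedHeapSize(
    const HloModule& module,
    const SequentialHloOrdering::HloModuleSequence& module_sequence,
    const TuplePointsToAnalysis& points_to_analysis) {
  auto size_fn = [](const BufferValue& buffer) {
    return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
  };
  TF_ASSIGN_OR_RETURN(
      HeapSimulator::Result result,
      HeapSimulator::Run(MakeUnique<DecreasingSizeRunsHeap>(
                             MakeUnique<LazyBestFitHeap>(/*alignment=*/64)),
                         module, module_sequence, points_to_analysis, size_fn));
  return result.heap_size;
}
```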
HeapSimulator( std::unique_ptr algorithm, - const LogicalBuffer::SizeFunction& size_fn, const Options& options, + const BufferValue::SizeFunction& size_fn, const Options& options, const SequentialHloOrdering::HloModuleSequence* module_sequence); ~HeapSimulator(); @@ -127,21 +128,21 @@ class HeapSimulator { const std::vector& instruction_sequence, const TuplePointsToAnalysis& points_to_analysis); - bool IgnoreBuffer(const LogicalBuffer* buffer) const; - void Alloc(const LogicalBuffer* buffer, const HloInstruction* instruction); - void Free(const LogicalBuffer* buffer, const HloInstruction* instruction); - void ShareBuffer(const LogicalBuffer* buffer, const LogicalBuffer* shared, + bool IgnoreBuffer(const BufferValue* buffer) const; + void Alloc(const BufferValue* buffer, const HloInstruction* instruction); + void Free(const BufferValue* buffer, const HloInstruction* instruction); + void ShareBuffer(const BufferValue* buffer, const BufferValue* shared, const HloInstruction* instruction); Result Finish(); void FillDebugTrace(HeapSimulatorTrace::Event::Kind kind, - const LogicalBuffer* buffer, + const BufferValue* buffer, const HloInstruction* instruction, - const LogicalBuffer* shared_with_canonical); + const BufferValue* shared_with_canonical); const std::unique_ptr no_fragmentation_stats_; const std::unique_ptr algorithm_; - const LogicalBuffer::SizeFunction size_fn_; + const BufferValue::SizeFunction size_fn_; const Options options_; const SequentialHloOrdering::HloModuleSequence* module_sequence_; @@ -160,15 +161,15 @@ class HeapSimulator { // The shared_buffers_ map associates each shared buffer (including the // canonical) to its SharedGroup control block. struct SharedGroup { - const LogicalBuffer* canonical = nullptr; + const BufferValue* canonical = nullptr; int64 refcount = 0; }; - tensorflow::gtl::FlatMap> + tensorflow::gtl::FlatMap> shared_buffers_; // Hold some sets for error-checking the sequence of Alloc and Free calls. - tensorflow::gtl::FlatSet allocated_buffers_; - tensorflow::gtl::FlatSet freed_buffers_; + tensorflow::gtl::FlatSet allocated_buffers_; + tensorflow::gtl::FlatSet freed_buffers_; // Debugging information filled in while the heap simulator runs. HeapSimulatorTrace debug_trace_; @@ -186,10 +187,10 @@ class HeapAlgorithm { virtual ~HeapAlgorithm() = default; // Alloc allocates a buffer of 'size' bytes. - virtual void Alloc(const LogicalBuffer* buffer, int64 size) = 0; + virtual void Alloc(const BufferValue* buffer, int64 size) = 0; // Free de-allocates a previously allocated buffer. - virtual void Free(const LogicalBuffer* buffer, int64 size) = 0; + virtual void Free(const BufferValue* buffer, int64 size) = 0; // Finish collects the buffer offset assignment results. Free may only be // called once, after the Alloc and Free calls. 
@@ -205,8 +206,8 @@ class NoFragmentationStatsHeap : public HeapAlgorithm { NoFragmentationStatsHeap() = default; ~NoFragmentationStatsHeap() override = default; - void Alloc(const LogicalBuffer* buffer, int64 size) override; - void Free(const LogicalBuffer* buffer, int64 size) override; + void Alloc(const BufferValue* buffer, int64 size) override; + void Free(const BufferValue* buffer, int64 size) override; Result Finish() override; private: @@ -223,14 +224,14 @@ class DecreasingSizeRunsHeap : public HeapAlgorithm { : algorithm_(std::move(algorithm)) {} ~DecreasingSizeRunsHeap() override {} - void Alloc(const LogicalBuffer* buffer, int64 size) override; - void Free(const LogicalBuffer* buffer, int64 size) override; + void Alloc(const BufferValue* buffer, int64 size) override; + void Free(const BufferValue* buffer, int64 size) override; Result Finish() override; private: // A single Alloc or Free operation that we've buffered in run_. struct Op { - const LogicalBuffer* buffer; + const BufferValue* buffer; int64 size; }; @@ -266,8 +267,8 @@ class LazyBestFitHeap : public HeapAlgorithm { LazyBestFitHeap(int64 alignment) : alignment_(alignment) {} ~LazyBestFitHeap() override {} - void Alloc(const LogicalBuffer* buffer, int64 size) override; - void Free(const LogicalBuffer* buffer, int64 size) override; + void Alloc(const BufferValue* buffer, int64 size) override; + void Free(const BufferValue* buffer, int64 size) override; Result Finish() override; private: diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc index 688a271712ac24..6271652412c297 100644 --- a/tensorflow/compiler/xla/service/heap_simulator_test.cc +++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc @@ -20,11 +20,12 @@ limitations under the License. #include #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" -#include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/hlo_value.h" #include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" @@ -38,7 +39,7 @@ const char kFree[] = "Free"; const char kFinish[] = "Finish"; // CallSequence records a sequence of Alloc/Free/Finish calls. -using CallSequence = std::vector>; +using CallSequence = std::vector>; // HeapCallRecorder is a dummy heap algorithm that simply records its calls. class HeapCallRecorder : public HeapAlgorithm { @@ -46,7 +47,7 @@ class HeapCallRecorder : public HeapAlgorithm { explicit HeapCallRecorder(CallSequence* calls) : calls_(calls) {} ~HeapCallRecorder() override {} - void Alloc(const LogicalBuffer* buffer, int64 size) override { + void Alloc(const BufferValue* buffer, int64 size) override { calls_->emplace_back(kAlloc, buffer); // Instead of assigning a real offset, we set the cardinality of the Alloc // call. 
This isn't a valid assignment, but allows us to easily test for @@ -54,7 +55,7 @@ class HeapCallRecorder : public HeapAlgorithm { const int64 offset = result_.chunk_map.size(); result_.chunk_map.emplace(buffer, Chunk{offset, size}); } - void Free(const LogicalBuffer* buffer, int64 size) override { + void Free(const BufferValue* buffer, int64 size) override { calls_->emplace_back(kFree, buffer); } Result Finish() override { @@ -76,7 +77,8 @@ class HeapSimulatorTracker { HeapSimulatorTracker( const string& name, std::unique_ptr computation, const std::vector& instruction_sequence) { - module_ = MakeUnique(name); + HloModuleConfig config; + module_ = MakeUnique(name, config); module_->AddEntryComputation(std::move(computation)); points_to_analysis_ = TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); @@ -84,7 +86,7 @@ class HeapSimulatorTracker { // size of the buffers doesn't matter, so we always return 0. We rely on // the secondary sorting criteria of DecreasingSizeRunsHeap to sort calls by // buffer id, for determinism in the tests. - auto zero_size = [](const LogicalBuffer& buffer) { return 0; }; + auto zero_size = [](const BufferValue& buffer) { return 0; }; auto algorithm = MakeUnique( MakeUnique(&actual_calls_)); result_ = HeapSimulator::Run( @@ -94,7 +96,8 @@ class HeapSimulatorTracker { } explicit HeapSimulatorTracker(const string& name) { - module_ = MakeUnique(name); + HloModuleConfig config; + module_ = MakeUnique(name, config); } // Similar to the single entry computation constructor above, but runs the @@ -115,9 +118,9 @@ class HeapSimulatorTracker { // Hack the size_fn so that it returns a decreasing value as we step through // the sequence. This lets us ensure the Alloc calls are in the sequence - // order. The Free calls are sorted by LogicalBuffer.id, which is at least + // order. The Free calls are sorted by BufferValue.id, which is at least // deterministic. - auto size_fn = [&reverse_position](const LogicalBuffer& buffer) { + auto size_fn = [&reverse_position](const BufferValue& buffer) { return reverse_position[buffer.instruction()]; }; auto algorithm = MakeUnique( @@ -130,8 +133,8 @@ class HeapSimulatorTracker { HloModule* module() { return module_.get(); } // Returns the buffer defined at the given instruction and index. - const LogicalBuffer* BufferAt(const HloInstruction* instruction, - const ShapeIndex& index) const { + const BufferValue* BufferAt(const HloInstruction* instruction, + const ShapeIndex& index) const { return points_to_analysis_->GetBufferDefinedAt(instruction, index) .ConsumeValueOrDie(); } @@ -147,8 +150,8 @@ class HeapSimulatorTracker { const ShapeIndex& index_a, const HloInstruction* instruction_b, const ShapeIndex& index_b) { - const LogicalBuffer* a = BufferAt(instruction_a, index_a); - const LogicalBuffer* b = BufferAt(instruction_b, index_b); + const BufferValue* a = BufferAt(instruction_a, index_a); + const BufferValue* b = BufferAt(instruction_b, index_b); EXPECT_EQ(result_.chunk_map[a].offset, result_.chunk_map[b].offset) << *a << ", " << *b; } @@ -522,7 +525,7 @@ TEST_F(HeapSimulatorTest, WholeModule) { // Now the final cond less-than buffer is allocated. {kAlloc, tracker.BufferAt(cond_lt, {})}, - // The order of the remaining Free calls is based on the LogicalBuffer.id, + // The order of the remaining Free calls is based on the BufferValue.id, // which is deterministic, but not obvious. 
{kFree, tracker.BufferAt(param, {})}, {kFree, tracker.BufferAt(param, {0})}, @@ -544,40 +547,40 @@ TEST_F(HeapSimulatorTest, WholeModule) { class HeapAlgorithmTestBase : public ::testing::Test { protected: HeapAlgorithmTestBase() : builder_("heap_simulator_test") { - buffer_a_ = DummyLogicalBuffer(); - buffer_b_ = DummyLogicalBuffer(); - buffer_c_ = DummyLogicalBuffer(); - buffer_d_ = DummyLogicalBuffer(); - buffer_e_ = DummyLogicalBuffer(); - buffer_f_ = DummyLogicalBuffer(); - buffer_g_ = DummyLogicalBuffer(); - buffer_h_ = DummyLogicalBuffer(); - buffer_i_ = DummyLogicalBuffer(); + buffer_a_ = DummyBufferValue(); + buffer_b_ = DummyBufferValue(); + buffer_c_ = DummyBufferValue(); + buffer_d_ = DummyBufferValue(); + buffer_e_ = DummyBufferValue(); + buffer_f_ = DummyBufferValue(); + buffer_g_ = DummyBufferValue(); + buffer_h_ = DummyBufferValue(); + buffer_i_ = DummyBufferValue(); } ~HeapAlgorithmTestBase() override {} - const LogicalBuffer* buffer_a_; - const LogicalBuffer* buffer_b_; - const LogicalBuffer* buffer_c_; - const LogicalBuffer* buffer_d_; - const LogicalBuffer* buffer_e_; - const LogicalBuffer* buffer_f_; - const LogicalBuffer* buffer_g_; - const LogicalBuffer* buffer_h_; - const LogicalBuffer* buffer_i_; + const BufferValue* buffer_a_; + const BufferValue* buffer_b_; + const BufferValue* buffer_c_; + const BufferValue* buffer_d_; + const BufferValue* buffer_e_; + const BufferValue* buffer_f_; + const BufferValue* buffer_g_; + const BufferValue* buffer_h_; + const BufferValue* buffer_i_; private: - // Create a dummy LogicalBuffer to pass to the heap algorithm. - const LogicalBuffer* DummyLogicalBuffer() { - const LogicalBuffer::Id id = buffers_.size(); + // Create a dummy BufferValue to pass to the heap algorithm. + const BufferValue* DummyBufferValue() { + const BufferValue::Id id = buffers_.size(); auto const0 = builder_.AddInstruction( HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - buffers_.emplace_back(MakeUnique(const0, ShapeIndex{}, id)); + buffers_.emplace_back(MakeUnique(id, const0, ShapeIndex{})); return buffers_.back().get(); } HloComputation::Builder builder_; - std::vector> buffers_; + std::vector> buffers_; }; class NoFragmentationStatsHeapTest : public HeapAlgorithmTestBase {}; diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto index aa6860880b7a13..1f7c1cffd324ad 100644 --- a/tensorflow/compiler/xla/service/hlo.proto +++ b/tensorflow/compiler/xla/service/hlo.proto @@ -147,6 +147,9 @@ message HloInstructionProto { repeated int64 called_computation_ids = 38; xla.OpSharding sharding = 40; + + // Backend configuration for the instruction. Has backend-specific meaning. + string backend_config = 43; } // Serialization of HloComputation. diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils.h b/tensorflow/compiler/xla/service/hlo_casting_utils.h new file mode 100644 index 00000000000000..b15f1f24c60771 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_casting_utils.h @@ -0,0 +1,101 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Casting utilitiy functions for HLO instructions. + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_ + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" + +namespace xla { + +template +using EnableIfDerivedFromHlo = + typename std::enable_if::value>::type; + +// TODO(b/93238915): Switch implementation from C++'s dynamic_cast to LLVM-like +// RTTI if it turns out to be a performance issue. +// Casts an HloInstruction pointer to one of its subclasses, dies if argument is +// nullptr or runtime information does not match. +// +// Similar to LLVM's cast. +template * = nullptr> +const T* Cast(const HloInstruction* instruction) { + CHECK(instruction != nullptr); + const T* casted = dynamic_cast(instruction); + CHECK(casted != nullptr); + return casted; +} + +// Non-const overload of Cast. +template * = nullptr> +T* Cast(HloInstruction* instruction) { + return const_cast( + Cast(const_cast(instruction))); +} + +// Works just like the Cast, except that it allows for a null pointer as an +// argument which it then propagates. +// +// Similar to LLVM's cast_or_null. +template * = nullptr> +const T* CastOrNull(const HloInstruction* instruction) { + return instruction != nullptr ? Cast(instruction) : nullptr; +} + +// Non-const overload of CastOrNull. +template * = nullptr> +T* CastOrNull(HloInstruction* instruction) { + return const_cast( + CastOrNull(const_cast(instruction))); +} + +// Casts an HloInstruction pointer to one of its subclasses, dies if argument is +// nullptr, returns nullptr if runtime information does not match. +// +// Similar to LLVM's dyn_cast. +template * = nullptr> +const T* DynCast(const HloInstruction* instruction) { + CHECK(instruction != nullptr); + return dynamic_cast(instruction); +} + +// Non-const overload of DynCast. +template * = nullptr> +T* DynCast(HloInstruction* instruction) { + return const_cast( + DynCast(const_cast(instruction))); +} + +// Works just like the DynCast, except that it allows for a null pointer as an +// argument which it then propagates. +// +// Similar to LLVM's dyn_cast_or_null. +template * = nullptr> +const T* DynCastOrNull(const HloInstruction* instruction) { + return instruction != nullptr ? DynCast(instruction) : nullptr; +} + +// Non-const overload of DynCastOrNull. +template * = nullptr> +T* DynCastOrNull(HloInstruction* instruction) { + return const_cast( + DynCastOrNull(const_cast(instruction))); +} + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_ diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc new file mode 100644 index 00000000000000..436a9222342dd8 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc @@ -0,0 +1,112 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
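The casting helpers above mirror LLVM's `cast`/`dyn_cast` family; they differ only in how mismatches and null pointers are handled. A small usage sketch: the `instr` parameter is assumed, and `DummyInstruction` is the test-only subclass defined in the test file that follows.

```c++
// Sketch only: prefer DynCast when the concrete type is uncertain.
void LogIfDummy(HloInstruction* instr) {
  if (DummyInstruction* dummy = DynCast<DummyInstruction>(instr)) {
    VLOG(2) << "matched: " << dummy->ToString();  // runtime type matched
  }
  // Cast<DummyInstruction>(instr) would CHECK-fail on a mismatch or on
  // nullptr; DynCastOrNull additionally tolerates instr == nullptr.
}
```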
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_casting_utils.h" + +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +class DummyInstruction : public HloInstruction { + public: + DummyInstruction() + : HloInstruction(HloOpcode::kConstant, ShapeUtil::MakeShape(F32, {})) {} +}; + +class AnotherDummyInstruction : public HloInstruction { + public: + AnotherDummyInstruction() + : HloInstruction(HloOpcode::kParameter, ShapeUtil::MakeShape(F32, {})) {} +}; + +TEST(HloCastingUtilsTest, CastSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = + Cast(static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, CastDiesForWrongType) { + AnotherDummyInstruction instruction; + ASSERT_DEATH( + Cast(static_cast(&instruction)), ""); +} + +TEST(HloCastingUtilsTest, CastDiesForNullptr) { + HloInstruction* null = nullptr; + ASSERT_DEATH(Cast(null), ""); +} + +TEST(HloCastingUtilsTest, CastOrNullSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = + Cast(static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, CastOrNullDiesForWrongType) { + AnotherDummyInstruction instruction; + ASSERT_DEATH( + Cast(static_cast(&instruction)), ""); +} + +TEST(HloCastingUtilsTest, CastOrNullReturnsNullptrForNullptr) { + HloInstruction* null = nullptr; + DummyInstruction* casted = CastOrNull(null); + ASSERT_EQ(casted, nullptr); +} + +TEST(HloCastingUtilsTest, DynCastSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = + DynCast(static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, DynCastReturnsNullptrForWrongType) { + AnotherDummyInstruction instruction; + DummyInstruction* casted = + DynCast(static_cast(&instruction)); + ASSERT_EQ(casted, nullptr); +} + +TEST(HloCastingUtilsTest, DynCastDiesForNullptr) { + HloInstruction* null = nullptr; + ASSERT_DEATH(DynCast(null), ""); +} + +TEST(HloCastingUtilsTest, DynCastOrNullSucceeds) { + DummyInstruction instruction; + DummyInstruction* casted = DynCastOrNull( + static_cast(&instruction)); + ASSERT_EQ(casted, &instruction); +} + +TEST(HloCastingUtilsTest, DynCastOrNullReturnsNullptrForWrongType) { + AnotherDummyInstruction instruction; + DummyInstruction* casted = DynCastOrNull( + static_cast(&instruction)); + ASSERT_EQ(casted, nullptr); +} + +TEST(HloCastingUtilsTest, DynCastOrNullReturnsNullptrForNullptr) { + HloInstruction* null = nullptr; + DummyInstruction* casted = DynCastOrNull(null); + ASSERT_EQ(casted, nullptr); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_clone_context.h b/tensorflow/compiler/xla/service/hlo_clone_context.h new file mode 100644 index 00000000000000..658643b427a962 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_clone_context.h @@ -0,0 +1,97 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CLONE_CONTEXT_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CLONE_CONTEXT_H_ + +#include + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/core/lib/gtl/flatmap.h" + +namespace xla { + +class HloInstruction; +class HloComputation; +class HloModule; + +// Data structure used to track the cloning of HloInstruction and HloComputation +// objects. +class HloCloneContext { + public: + // Creates a new HloCloneContext object to clone HloInstruction and + // HloComputation objects to be added to the module specified as argument. + // The suffix string will be appended to computation names. + explicit HloCloneContext(HloModule* module, const string& suffix = "") + : module_(module), suffix_(suffix) {} + + HloModule* module() const { return module_; } + + const string& suffix() const { return suffix_; } + + void MapInstruction(const HloInstruction* old_instruction, + HloInstruction* new_instruction) { + instructions_[old_instruction] = new_instruction; + } + + void MapComputation(const HloComputation* old_computation, + HloComputation* new_computation) { + computations_[old_computation] = new_computation; + } + + // Finds the new instruction mapped to its old copy, or return nullptr in case + // it is not found. + HloInstruction* FindInstruction(const HloInstruction* old_instruction) const { + return FindOrDefault(instructions_, old_instruction, nullptr); + } + + // Finds the new computation mapped to its old copy, or return nullptr in case + // it is not found. + HloComputation* FindComputation(const HloComputation* old_computation) const { + return FindOrDefault(computations_, old_computation, nullptr); + } + + // Retrieves the new instruction mapped to its old copy, or fail if not found. + HloInstruction* GetInstruction(const HloInstruction* old_instruction) const { + return FindOrDie(instructions_, old_instruction); + } + + // Retrieves the new computation mapped to its old copy, or fail if not found. + HloComputation* GetComputation(const HloComputation* old_computation) const { + return FindOrDie(computations_, old_computation); + } + + const tensorflow::gtl::FlatMap& + cloned_instructions() const { + return instructions_; + } + + const tensorflow::gtl::FlatMap& + cloned_computations() const { + return computations_; + } + + private: + HloModule* module_; + string suffix_; + tensorflow::gtl::FlatMap + instructions_; + tensorflow::gtl::FlatMap + computations_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CLONE_CONTEXT_H_ diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc index 594413e88fb26e..b61eabbbf52624 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.cc +++ b/tensorflow/compiler/xla/service/hlo_computation.cc @@ -347,6 +347,11 @@ std::list HloComputation::MakeEmbeddedComputationsList() // To avoid special handling of this computation, cast away const of // 'this'. 'this' is immediately removed from the post order after // construction. 
+ // + // TODO(b/78350259): This violates const-correctness, since while the original + // computation is not returned, we still retrieve non-const computations from + // a const one. Consider also avoiding const for HloComputation, or review XLA + // for const-correctness of non-HloInstruction* types like this. ComputeComputationPostOrder(const_cast(this), &visited, &post_order); @@ -360,25 +365,38 @@ std::list HloComputation::MakeEmbeddedComputationsList() string HloComputation::ToString(const HloPrintOptions& options) const { std::ostringstream s; for (int i = 0; i < options.indent_amount(); i++) { - s << " "; + s << " "; } - if (options.print_percent()) { - s << "%"; + + if (!options.is_in_nested_computation()) { + if (options.print_percent()) { + s << "%"; + } + s << name() << " "; } - s << name(); + if (options.print_program_shape()) { - s << " " << ShapeUtil::HumanString(ComputeProgramShape()); - } - s << " {\n"; - for (const HloInstruction* instruction : MakeInstructionPostOrder()) { - for (int i = 0; i < options.indent_amount(); i++) { - s << " "; + s << ShapeUtil::HumanString(ComputeProgramShape()) << " "; + } + s << "{\n"; + { + // Print the instructions in this computation. + HloPrintOptions new_options = options; + new_options.set_indent_amount(options.indent_amount() + 1) + .set_is_in_nested_computation(true); + CanonicalNameMap name_map; + for (const HloInstruction* instruction : MakeInstructionPostOrder()) { + for (int i = 0; i < new_options.indent_amount(); i++) { + s << " "; + } + s << (instruction == root_instruction_ ? "ROOT " : "") + << instruction->ToStringWithCanonicalNameMap(new_options, &name_map) + << "\n"; } - s << " " << (instruction == root_instruction_ ? "ROOT " : "") - << instruction->ToString(options) << "\n"; } + for (int i = 0; i < options.indent_amount(); i++) { - s << " "; + s << " "; } s << "}"; return s.str(); @@ -402,27 +420,37 @@ HloComputationProto HloComputation::ToProto() const { /* static */ StatusOr> HloComputation::CreateFromProto( - HloModule* module, const HloComputationProto& proto, + const HloComputationProto& proto, const tensorflow::gtl::FlatMap& computation_map) { - std::vector> instructions; tensorflow::gtl::FlatMap instruction_map; + tensorflow::gtl::FlatMap to_proto_id; + std::vector> instructions; int64 parameter_count = 0; for (const HloInstructionProto& instruction_proto : proto.instructions()) { TF_ASSIGN_OR_RETURN( std::unique_ptr instruction, - HloInstruction::CreateFromProto(module, instruction_proto, - instruction_map, computation_map)); + HloInstruction::CreateFromProto(instruction_proto, instruction_map, + computation_map)); if (instruction->opcode() == HloOpcode::kParameter) { parameter_count++; } TF_RET_CHECK(!ContainsKey(instruction_map, instruction_proto.id())); instruction_map[instruction_proto.id()] = instruction.get(); + to_proto_id[instruction.get()] = instruction_proto.id(); instructions.push_back(std::move(instruction)); } TF_RET_CHECK(proto.root_id() != -1); TF_RET_CHECK(ContainsKey(instruction_map, proto.root_id())); HloInstruction* root = instruction_map.at(proto.root_id()); + + // Sort the instructions in the proto id's order. 
+ std::sort(instructions.begin(), instructions.end(), + [&](const std::unique_ptr& a, + const std::unique_ptr& b) { + return to_proto_id[a.get()] < to_proto_id[b.get()]; + }); + return WrapUnique(new HloComputation(proto.name(), parameter_count, &instructions, root, /*fusion_instruction=*/nullptr)); @@ -723,18 +751,24 @@ Status HloComputation::Accept( return this->Accept(&visitor); } -std::unique_ptr HloComputation::Clone(const string& suffix, - HloModule* module) { +std::unique_ptr HloComputation::Clone( + const string& suffix, HloCloneContext* context) { return CloneWithReplacements( /*replacements=*/std::unordered_map>(), - module, suffix); + context, suffix); } std::unique_ptr HloComputation::CloneWithReplacements( std::unordered_map> replacements, - HloModule* module, const string& suffix) { + HloCloneContext* context, const string& suffix) { + std::unique_ptr context_ptr; + if (context == nullptr) { + context_ptr = MakeUnique(parent(), suffix); + context = context_ptr.get(); + } + // Look up instr in the replacements map, and return either the replacement, // or instr, if the replacement isn't present. // @@ -756,24 +790,19 @@ std::unique_ptr HloComputation::CloneWithReplacements( } } - std::unordered_map clone_map; std::vector> instructions; - std::unique_ptr new_instr = nullptr; + std::unique_ptr new_instr; for (auto instr : postorder) { std::vector new_operands; for (auto operand : instr->operands()) { auto replaced_operand = replace(operand); - // If replaced_operand is null, that means 'replacements' asked us not to - // include operand in the new computation. But we can't do that, because - // operand is used by instr. CHECK_NE(replaced_operand, nullptr) << "replacements map tried to eliminate a used instruction " << operand->ToString() << ", used by " << instr->ToString(); - new_operands.push_back(FindOrDie(clone_map, replaced_operand)); + new_operands.push_back(context->GetInstruction(replaced_operand)); } new_instr = - instr->CloneWithNewOperands(instr->shape(), new_operands, module); - InsertOrDie(&clone_map, instr, new_instr.get()); + instr->CloneWithNewOperands(instr->shape(), new_operands, context); instructions.push_back(std::move(new_instr)); } Builder builder(name() + "." + suffix); @@ -781,27 +810,25 @@ std::unique_ptr HloComputation::CloneWithReplacements( builder.AddInstruction(std::move(instr)); } auto result = builder.Build( - /*root_instruction=*/FindOrDie(clone_map, replace(root_instruction()))); + /*root_instruction=*/context->GetInstruction( + replace(root_instruction()))); // Clone control dependencies. for (auto instr : postorder) { - HloInstruction* new_instr = FindOrDie(clone_map, instr); + HloInstruction* new_instr = context->GetInstruction(instr); for (auto successor : instr->control_successors()) { auto replaced_successor = replace(successor); - - // successor may not be in clone_map, because it might have been + // successor may not have been remapped, because it might have been // removed by the replacements map. - if (replaced_successor == nullptr) { - continue; + if (replaced_successor != nullptr) { + TF_CHECK_OK(new_instr->AddControlDependencyTo( + context->GetInstruction(replaced_successor))); } - - TF_CHECK_OK(new_instr->AddControlDependencyTo( - FindOrDie(clone_map, replaced_successor))); } } - + context->MapComputation(this, result.get()); // We cloned the elements of 'replacements', so they're all going to be - // destroyed. HloInstructions need to be detached from their operands before + // destroyed. 
HloInstructions need to be detached from their operands before // they're destroyed, otherwise they stick around in the operands' users lists // and cause use-after-frees. for (auto& kv : replacements) { @@ -809,7 +836,6 @@ std::unique_ptr HloComputation::CloneWithReplacements( new_instr->DetachFromOperands(); } } - return result; } diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h index 9d3f6e9a2c2efd..0da4a305f3d5d6 100644 --- a/tensorflow/compiler/xla/service/hlo_computation.h +++ b/tensorflow/compiler/xla/service/hlo_computation.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_clone_context.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" @@ -49,9 +50,20 @@ class HloModule; // Describes a computation at the HLO level. // -// An HloComputation contains a directed acyclic graph of HLO instructions. The -// computation has a single root instruction which produces the output of the -// computation. +// You can think of an HloComputation like a function. It has some inputs +// (parameters) and returns exactly one value (the value of its root node). If +// you want to return multiple values, you can return a tuple. +// +// The instructions inside of a computation do not have an explicit total order. +// Instead, they have a partial order determined by their data and control +// dependencies. +// +// An HloModule contains one "entry computation" -- this is like main() in a C +// program. Every other computation inside of a module is attached to one or +// more HloInstructions, as a "nested computation". For example, the kMap +// instruction has a nested computation and "applies" it to every element of its +// input, elementwise. (That is, the input [x, y, z] is transformed to [f(x), +// f(y), f(z)].) class HloComputation { public: // Builder class for HloComputation. @@ -157,14 +169,12 @@ class HloComputation { // Creates a computation from the given proto. Arguments: // - // module: the module which will contain the computation. The newly created - // computation is *not* added to the module, however. // proto: the proto to convert from. // computation_map: a map from computation id to HloComputation*. This map // must contain all computations which the newly constructed computation // calls. static StatusOr> CreateFromProto( - HloModule* module, const HloComputationProto& proto, + const HloComputationProto& proto, const tensorflow::gtl::FlatMap& computation_map); // Gets the instructions in this computation. @@ -291,11 +301,11 @@ class HloComputation { const std::function& visitor_func) const; // Returns a deep copy of this computation including all instructions. - // If the module pointer is not nullptr, it will be the module where - // the cloned computations will be added to (in order to support deep - // cloning). + // If the clone context is specified, it will be populated with the cloned + // object mappings, and its module() will be used to add new computations + // into. 
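[Editor's note] The rewritten class comment above describes an `HloComputation` as a function: parameters in, exactly one root value out, with a tuple root used to "return" several values, and the module's entry computation playing the role of `main()`. A small builder sketch along those lines follows; the names and shapes are made up for illustration, and `module` is assumed to come from a helper such as the `CreateNewModule()` used by the tests later in this diff.

```c++
// Sketch: a computation (p0, p1) -> (p0 + p1, p0 * p1). The single root is a
// tuple, which is how a computation returns more than one value.
HloComputation::Builder builder("add_and_mul");
const Shape r0f32 = ShapeUtil::MakeShape(F32, {});
HloInstruction* p0 = builder.AddInstruction(
    HloInstruction::CreateParameter(0, r0f32, "p0"));
HloInstruction* p1 = builder.AddInstruction(
    HloInstruction::CreateParameter(1, r0f32, "p1"));
HloInstruction* add = builder.AddInstruction(
    HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, p0, p1));
HloInstruction* mul = builder.AddInstruction(
    HloInstruction::CreateBinary(r0f32, HloOpcode::kMultiply, p0, p1));
builder.AddInstruction(HloInstruction::CreateTuple({add, mul}));
// The entry computation plays the role of main() for the module.
HloComputation* entry = module->AddEntryComputation(builder.Build());
```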
std::unique_ptr Clone(const string& suffix = "clone", - HloModule* module = nullptr); + HloCloneContext* context = nullptr); // Like Clone(), but if an instruction is present in replacement_map, we use // the map's value to replace that instruction in the cloned computation. @@ -305,7 +315,7 @@ class HloComputation { std::unique_ptr CloneWithReplacements( std::unordered_map> replacements, - HloModule* module = nullptr, const string& suffix = "clone"); + HloCloneContext* context = nullptr, const string& suffix = "clone"); // Returns true if the given instruction can be removed from the computation. // Parameter instructions cannot be removed without violating invariants of diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc index 7b7588f4ba9aa6..25469a54c48f4f 100644 --- a/tensorflow/compiler/xla/service/hlo_computation_test.cc +++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc @@ -550,6 +550,108 @@ TEST_F(HloComputationTest, Reachability) { EXPECT_FALSE(reachability->IsReachable(constant2, copy)); } +TEST_F(HloComputationTest, Stringification) { + const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); + const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); + const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20}); + const Shape sout = ShapeUtil::MakeShape(F32, {5, 20}); + + HloComputation::Builder builder("TransposeDot"); + HloInstruction* x = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x")); + HloInstruction* y = + builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y")); + HloInstruction* reshape = + builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0})); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + builder.AddInstruction( + HloInstruction::CreateDot(sout, x, reshape, dot_dnums)); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + + auto options = HloPrintOptions().set_print_metadata(false); + EXPECT_EQ(computation->ToString(options), + R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { + %x = f32[5,10]{1,0} parameter(0) + %y = f32[20,10]{1,0} parameter(1) + %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0} + ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} +})"); +} + +TEST_F(HloComputationTest, StringificationIndent) { + const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); + const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); + const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20}); + const Shape sout = ShapeUtil::MakeShape(F32, {5, 20}); + + HloComputation::Builder builder("TransposeDot"); + HloInstruction* x = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x")); + HloInstruction* y = + builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y")); + HloInstruction* reshape = + builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0})); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + builder.AddInstruction( + HloInstruction::CreateDot(sout, x, reshape, dot_dnums)); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + + auto options = + HloPrintOptions().set_print_metadata(false).set_indent_amount(2); + 
EXPECT_EQ(computation->ToString(options), + R"( %TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { + %x = f32[5,10]{1,0} parameter(0) + %y = f32[20,10]{1,0} parameter(1) + %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0} + ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} + })"); +} + +TEST_F(HloComputationTest, StringificationCanonical) { + const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); + const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); + const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20}); + const Shape sout = ShapeUtil::MakeShape(F32, {5, 20}); + + HloComputation::Builder builder("TransposeDot"); + HloInstruction* x = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x")); + HloInstruction* y = + builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y")); + HloInstruction* reshape = + builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0})); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + builder.AddInstruction( + HloInstruction::CreateDot(sout, x, reshape, dot_dnums)); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + + auto options = HloPrintOptions().set_print_metadata(false); + EXPECT_EQ(computation->ToString(options), + R"(%TransposeDot (x: f32[5,10], y: f32[20,10]) -> f32[5,20] { + %x = f32[5,10]{1,0} parameter(0) + %y = f32[20,10]{1,0} parameter(1) + %transpose = f32[10,20]{1,0} transpose(f32[20,10]{1,0} %y), dimensions={1,0} + ROOT %dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} %transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} +})"); + + options = HloPrintOptions().Canonical(); + EXPECT_EQ(computation->ToString(options), R"(TransposeDot { + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} + ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} +})"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc index 7b552ee5b1798c..5d05ccfc0b223d 100644 --- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc @@ -149,7 +149,7 @@ TEST_F(HloConstantFoldingTest, Slice) { const int64 slice_limits[] = {10, 8, 6, 5, 9}; const int64 slice_strides[] = {1, 1, 1, 1, 1}; TF_ASSERT_OK_AND_ASSIGN(auto literal, - LiteralTestUtil::CreateRandomLiteral( + Literal::CreateRandomLiteral( ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); HloInstruction* literal_instruction = builder.AddInstruction( HloInstruction::CreateConstant(std::move(literal))); @@ -172,7 +172,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) { HloComputation::Builder builder(TestName()); const int64 dimensions[] = {11, 8, 7, 5, 9}; TF_ASSERT_OK_AND_ASSIGN(auto literal, - LiteralTestUtil::CreateRandomLiteral( + Literal::CreateRandomLiteral( ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); auto literal_clone = literal->Literal::CloneToUnique(); HloInstruction* literal_instruction = builder.AddInstruction( diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc 
index 44e4f75f75b275..94c9c7eabcc99d 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc @@ -142,19 +142,25 @@ Status HloCostAnalysis::HandleReducePrecision(const HloInstruction* hlo) { } Status HloCostAnalysis::HandleParameter(const HloInstruction*) { + current_should_compute_bottleneck_time_ = false; current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; return Status::OK(); } Status HloCostAnalysis::HandleConstant(const HloInstruction*) { + current_should_compute_bottleneck_time_ = false; current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; return Status::OK(); } Status HloCostAnalysis::HandleGetTupleElement(const HloInstruction*) { // GetTupleElement forwards a pointer and does not touch each element in the // output. + current_should_compute_bottleneck_time_ = false; current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; return Status::OK(); } @@ -329,6 +335,7 @@ Status HloCostAnalysis::HandleSelectAndScatter( Status HloCostAnalysis::HandleBitcast(const HloInstruction*) { // A bitcast does no computation and touches no memory. current_properties_[kBytesAccessedKey] = 0; + current_properties_[kOptimalSecondsKey] = 0; return Status::OK(); } @@ -555,11 +562,13 @@ Status HloCostAnalysis::HandleCall(const HloInstruction* call) { } Status HloCostAnalysis::HandleCustomCall(const HloInstruction*) { - // We can't do anything sane with CustomCalls, since we don't know what they - // do, and returning an error status will stop iteration over this - // computation, which is probably also not what we want. So just punt and - // return OK. This will cause all of the properties to be reported as 0, - // which is fine. + // Mark applicable fields as "unknown", since we don't know what CustomCall + // does. This is better than returning an error, which would stop iteration, + // and therefore would prevent us from getting *any* stats for a computation + // which contains a CustomCall. + current_properties_[kOptimalSecondsKey] = -1; + current_properties_[kBytesAccessedKey] = -1; + current_properties_[kFlopsKey] = -1; current_should_compute_bottleneck_time_ = false; return Status::OK(); } diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc index 3d055b327ee920..16fdda8a8b9ade 100644 --- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc @@ -20,16 +20,13 @@ limitations under the License. 
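[Editor's note] The `HandleCustomCall` change above stops reporting CustomCall as free and instead writes -1 into the flop, bytes-accessed, and optimal-seconds properties to mean "unknown". Code that aggregates per-instruction costs would then need to treat the sentinel explicitly; the sketch below assumes the usual per-instruction accessors on `HloCostAnalysis` (e.g. `flop_count(const HloInstruction&)`), which are not shown in this excerpt.

```c++
// Sketch only: skip instructions whose cost is marked unknown (-1) when
// aggregating, instead of silently adding a negative number.
int64 total_flops = 0;
for (const HloInstruction* instr : computation->instructions()) {
  const int64 flops = analysis.flop_count(*instr);
  if (flops < 0) continue;  // CustomCall reports -1, i.e. "unknown".
  total_flops += flops;
}
```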
#include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/local_service.h" #include "tensorflow/compiler/xla/service/service.h" -#include "tensorflow/compiler/xla/service/user_computation.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/logging.h" @@ -58,11 +55,10 @@ class HloCostAnalysisTest : public ::testing::Test { // whitebox accesses to the user computation built from the client, // as shown in the BuildHloGraph functions below. service_(static_cast(ClientLibrary::GetXlaService( - static_cast(client_)->platform()))), - computation_tracker_(service_->computation_tracker()) { + static_cast(client_)->platform()))) { // Create a computation for a unary user function: x => exp(x + 0.5) { - ComputationBuilder builder(client_, "add_and_exp"); + XlaBuilder builder("add_and_exp"); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto half = builder.ConstantR0(0.5); builder.Exp(builder.Add(x, half)); @@ -73,7 +69,7 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a binary user function: (x, y) => x + y { - ComputationBuilder builder(client_, "add"); + XlaBuilder builder("add"); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); builder.Add(x, y); @@ -84,7 +80,7 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a sigmoid function: x => 1 / (1 + exp(-x)) { - ComputationBuilder builder(client_, "sigmoid"); + XlaBuilder builder("sigmoid"); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto one = builder.ConstantR0(1.0); builder.Div(one, builder.Add(one, builder.Exp(builder.Neg(x)))); @@ -95,7 +91,7 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a binary max function: (x, y) => max (x, y) { - ComputationBuilder builder(client_, "max"); + XlaBuilder builder("max"); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); builder.Max(x, y); @@ -106,7 +102,7 @@ class HloCostAnalysisTest : public ::testing::Test { // Create a computation for a binary GT function: (x, y) => x > y { - ComputationBuilder builder(client_, "gt"); + XlaBuilder builder("gt"); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {}), "y"); builder.Gt(x, y); @@ -117,35 +113,30 @@ class HloCostAnalysisTest : public ::testing::Test { } // Build HLO graph from the given builder and return the HLO module. 
- std::unique_ptr BuildHloGraph(ComputationBuilder* builder) { + std::unique_ptr BuildHloGraph(XlaBuilder* builder) { auto computation_status = builder->Build(); TF_CHECK_OK(computation_status.status()); auto computation = computation_status.ConsumeValueOrDie(); - auto user_computation_status = - computation_tracker_.Resolve(computation.handle()); - TF_CHECK_OK(user_computation_status.status()); - auto user_computation = user_computation_status.ConsumeValueOrDie(); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - return std::move( - computation_tracker_.BuildHloModule(versioned_handle, HloModuleConfig()) - .ValueOrDie()); + auto config = HloModule::CreateModuleConfigFromProto(computation.proto(), + DebugOptions()) + .ConsumeValueOrDie(); + return HloModule::CreateFromProto(computation.proto(), config) + .ConsumeValueOrDie(); } Client* client_; Service* service_; - const ComputationTracker& computation_tracker_; // User computations used for higher order operations (e.g., Map, Reduce). - Computation add_; - Computation add_and_exp_; - Computation sigmoid_; - Computation max_; - Computation gt_; + XlaComputation add_; + XlaComputation add_and_exp_; + XlaComputation sigmoid_; + XlaComputation max_; + XlaComputation gt_; }; TEST_F(HloCostAnalysisTest, MatrixMultiply) { - ComputationBuilder builder(client_, "matrix_multiply"); + XlaBuilder builder("matrix_multiply"); auto lhs = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 5}), "lhs"); auto rhs = builder.Parameter(1, ShapeUtil::MakeShape(F32, {5, 30}), "rhs"); auto result = builder.Dot(lhs, rhs); @@ -167,7 +158,7 @@ TEST_F(HloCostAnalysisTest, MatrixMultiply) { } TEST_F(HloCostAnalysisTest, Map) { - ComputationBuilder builder(client_, "map"); + XlaBuilder builder("map"); auto input = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10}), "in"); auto result = builder.Map({input}, add_and_exp_, {0}); @@ -184,7 +175,7 @@ TEST_F(HloCostAnalysisTest, Map) { } TEST_F(HloCostAnalysisTest, Convolution) { - ComputationBuilder builder(client_, "convolution"); + XlaBuilder builder("convolution"); auto input = builder.Parameter( 0, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10, @@ -213,7 +204,7 @@ TEST_F(HloCostAnalysisTest, Convolution) { } TEST_F(HloCostAnalysisTest, Reduce) { - ComputationBuilder builder(client_, "reduce"); + XlaBuilder builder("reduce"); auto input = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); auto result = @@ -231,7 +222,7 @@ TEST_F(HloCostAnalysisTest, Reduce) { } TEST_F(HloCostAnalysisTest, ReduceWindow) { - ComputationBuilder builder(client_, "reduce_window"); + XlaBuilder builder("reduce_window"); auto input = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); auto result = builder.ReduceWindow(input, builder.ConstantR0(0), add_, @@ -248,7 +239,7 @@ TEST_F(HloCostAnalysisTest, ReduceWindow) { } TEST_F(HloCostAnalysisTest, SelectAndScatter) { - ComputationBuilder builder(client_, "select_and_scatter"); + XlaBuilder builder("select_and_scatter"); auto operand = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 20}), "input"); auto source = @@ -269,7 +260,7 @@ TEST_F(HloCostAnalysisTest, SelectAndScatter) { } TEST_F(HloCostAnalysisTest, Broadcast) { - ComputationBuilder b(client_, "broadcast"); + XlaBuilder b("broadcast"); b.Broadcast(b.ConstantR0(42), {10, 7}); auto hlo_module = BuildHloGraph(&b); HloCostAnalysis analysis(ShapeSize); @@ -280,7 +271,7 @@ TEST_F(HloCostAnalysisTest, Broadcast) { // Calculates the computation 
cost of a graph with more than one HLO node. TEST_F(HloCostAnalysisTest, FullyConnectedForward) { - ComputationBuilder builder(client_, "fully_connected_forward"); + XlaBuilder builder("fully_connected_forward"); auto input = builder.Parameter(0, ShapeUtil::MakeShape(F32, {10, 5}), "input"); auto weight = @@ -305,7 +296,7 @@ TEST_F(HloCostAnalysisTest, FullyConnectedForward) { TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { HloCostAnalysis conv_analysis(ShapeSize); { - ComputationBuilder builder(client_, "conv_looking_matmul"); + XlaBuilder builder("conv_looking_matmul"); auto lhs = builder.Parameter(0, ShapeUtil::MakeShape(F32, {64, 64, 1, 1}), "input"); auto rhs = builder.Parameter(1, ShapeUtil::MakeShape(F32, {64, 64, 1, 1}), @@ -318,7 +309,7 @@ TEST_F(HloCostAnalysisTest, MatmulAndConvolutionCanBeTheSameComputation) { HloCostAnalysis matmul_analysis(ShapeSize); { - ComputationBuilder builder(client_, "matmul"); + XlaBuilder builder("matmul"); auto lhs = builder.Parameter(0, ShapeUtil::MakeShape(F32, {64, 64}), "input"); auto rhs = @@ -370,8 +361,8 @@ TEST_F(FusionCostAnalysis, LoopFusion) { HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp)); auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1}); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop); @@ -412,8 +403,8 @@ TEST_F(FusionCostAnalysis, NoLayout) { auto add = builder.AddInstruction(HloInstruction::CreateBinary( shape_with_layout, HloOpcode::kAdd, c1, broadcast)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {add, broadcast}, HloInstruction::FusionKind::kLoop); @@ -427,7 +418,7 @@ TEST_F(FusionCostAnalysis, NoLayout) { TEST_F(HloCostAnalysisTest, TupleCost) { HloCostAnalysis analysis(ShapeSize); { - ComputationBuilder builder(client_, "matmul"); + XlaBuilder builder("matmul"); auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {123}), "x"); auto y = builder.Parameter(1, ShapeUtil::MakeShape(F32, {42}), "y"); auto tuple = builder.Tuple({x, y}); @@ -443,7 +434,7 @@ TEST_F(HloCostAnalysisTest, TupleCost) { } TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) { - ComputationBuilder builder(client_, "BaseDilatedConvolution"); + XlaBuilder builder("BaseDilatedConvolution"); auto input = builder.Parameter( 0, ShapeUtil::MakeShape(F32, {/*p_dim=*/1, /*z_dim=*/1, /*y_dim=*/10, @@ -458,7 +449,7 @@ TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) { auto result = builder.ConvGeneralDilated( input, kernel, /*window_strides=*/{1, 1}, /*padding=*/{{1, 1}, {1, 1}}, /*lhs_dilation=*/{3, 5}, /*rhs_dilation=*/{7, 11}, - ComputationBuilder::CreateDefaultConvDimensionNumbers(2)); + XlaBuilder::CreateDefaultConvDimensionNumbers(2)); // Run HLO cost analysis. 
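[Editor's note] Each of these tests follows the same three-step pattern: build an `XlaComputation` with `XlaBuilder`, turn its proto into an `HloModule` via the fixture's `BuildHloGraph()`, then walk the graph with an `HloCostAnalysis` visitor. A condensed sketch of that pattern is below; `ASSERT_IS_OK`, `ShapeSize`, and the property accessors are assumed from the elided parts of this test file rather than copied from any single test.

```c++
// Condensed sketch of the common test pattern; assumes it runs inside
// HloCostAnalysisTest, which provides BuildHloGraph() and ShapeSize.
XlaBuilder builder("elementwise_add");
auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {16}), "x");
builder.Add(x, x);

std::unique_ptr<HloModule> hlo_module = BuildHloGraph(&builder);
HloCostAnalysis analysis(ShapeSize);
ASSERT_IS_OK(
    hlo_module->entry_computation()->root_instruction()->Accept(&analysis));

// One flop per output element for an elementwise add over f32[16].
EXPECT_EQ(analysis.flop_count(), 16);
EXPECT_GT(analysis.bytes_accessed(), 0);
```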
auto hlo_module = BuildHloGraph(&builder); diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.cc b/tensorflow/compiler/xla/service/hlo_creation_utils.cc index 9a89888480b8c7..0fb65c845a6d44 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.cc @@ -162,6 +162,17 @@ StatusOr MakeConcatHlo(ArraySlice operands, HloInstruction::CreateConcatenate(concat_shape, operands, dimension)); } +StatusOr MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs, + const DotDimensionNumbers& dim_numbers) { + HloComputation* computation = lhs->parent(); + CHECK_EQ(computation, rhs->parent()); + TF_ASSIGN_OR_RETURN( + Shape dot_shape, + ShapeInference::InferDotOpShape(lhs->shape(), rhs->shape(), dim_numbers)); + return computation->AddInstruction( + HloInstruction::CreateDot(dot_shape, lhs, rhs, dim_numbers)); +} + StatusOr CollapseFirstNDims(HloInstruction* operand, int64 n) { CHECK_GT(n, 0); @@ -269,7 +280,7 @@ StatusOr BroadcastZeros( StatusOr> CreateComputationWithSignature( ArraySlice domain, const Shape& range, tensorflow::StringPiece name) { - HloComputation::Builder b(name.ToString()); + HloComputation::Builder b{std::string(name)}; int64 param_idx = 0; for (const Shape* param_shape : domain) { b.AddInstruction(HloInstruction::CreateParameter( diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils.h b/tensorflow/compiler/xla/service/hlo_creation_utils.h index c9a7361a6af0c2..49b1402d689a74 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils.h +++ b/tensorflow/compiler/xla/service/hlo_creation_utils.h @@ -97,6 +97,11 @@ StatusOr MakeGetTupleElementHlo(HloInstruction* operand, StatusOr MakeConcatHlo( tensorflow::gtl::ArraySlice operands, int64 dimension); +// Creates a Dot HLO instruction and adds it to the computation containing `lhs` +// and `rhs` (both must be in the same computation). +StatusOr MakeDotHlo(HloInstruction* lhs, HloInstruction* rhs, + const DotDimensionNumbers& dim_numbers); + // ----------------------------------------------------------------------------- // Some other miscellaneous helpers to generate common HLO patterns. All of // these add all the instructions they generate into the computation containing diff --git a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc index 6b681a5bf6f34b..7e7c4f95fed737 100644 --- a/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc +++ b/tensorflow/compiler/xla/service/hlo_creation_utils_test.cc @@ -19,27 +19,32 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/platform/test.h" namespace xla { namespace { using tensorflow::gtl::ArraySlice; -std::unique_ptr CreateModuleWithProgramShape( - PrimitiveType primitive_type, ArraySlice input_shape_dims, - ArraySlice output_shape_dims, HloInstruction** param, - HloComputation** entry_computation) { - Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims); - Shape output_shape = ShapeUtil::MakeShape(primitive_type, output_shape_dims); - std::unique_ptr module = MakeUnique("test"); - *entry_computation = module->AddEntryComputation( - CreateComputationWithSignature({&input_shape}, output_shape, "entry") - .ValueOrDie()); - *param = (*entry_computation)->parameter_instruction(0); - return module; -} - -TEST(HloCreationUtilsTest, CollapseFirst1Dim) { +class HloCreationUtilsTest : public HloTestBase { + protected: + static std::unique_ptr CreateModuleWithProgramShape( + PrimitiveType primitive_type, ArraySlice input_shape_dims, + ArraySlice output_shape_dims, HloInstruction** param, + HloComputation** entry_computation) { + Shape input_shape = ShapeUtil::MakeShape(primitive_type, input_shape_dims); + Shape output_shape = + ShapeUtil::MakeShape(primitive_type, output_shape_dims); + auto module = CreateNewModule("test"); + *entry_computation = module->AddEntryComputation( + CreateComputationWithSignature({&input_shape}, output_shape, "entry") + .ValueOrDie()); + *param = (*entry_computation)->parameter_instruction(0); + return module; + } +}; + +TEST_F(HloCreationUtilsTest, CollapseFirst1Dim) { HloInstruction* param; HloComputation* entry_computation; @@ -59,7 +64,7 @@ TEST(HloCreationUtilsTest, CollapseFirst1Dim) { CHECK_EQ(*result_literal, *Literal::CreateR1({3, 4})); } -TEST(HloCreationUtilsTest, CollapseFirst2Dims) { +TEST_F(HloCreationUtilsTest, CollapseFirst2Dims) { HloInstruction* param; HloComputation* entry_computation; @@ -84,7 +89,7 @@ TEST(HloCreationUtilsTest, CollapseFirst2Dims) { {{1, 2}, {3, 4}, {5, 6}, {-1, -2}, {-3, -4}, {-5, -6}})); } -TEST(HloCreationUtilsTest, Prepend1DegenerateDim) { +TEST_F(HloCreationUtilsTest, Prepend1DegenerateDim) { HloInstruction* param; HloComputation* entry_computation; @@ -104,7 +109,7 @@ TEST(HloCreationUtilsTest, Prepend1DegenerateDim) { CHECK_EQ(*result_literal, *Literal::CreateR2({{9, 10}})); } -TEST(HloCreationUtilsTest, Prepend2DegenerateDims) { +TEST_F(HloCreationUtilsTest, Prepend2DegenerateDims) { HloInstruction* param; HloComputation* entry_computation; @@ -124,7 +129,7 @@ TEST(HloCreationUtilsTest, Prepend2DegenerateDims) { CHECK_EQ(*result_literal, *Literal::CreateR3({{{9, 10}}})); } -TEST(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) { +TEST_F(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) { HloInstruction* param; HloComputation* entry_computation; @@ -144,7 +149,7 @@ TEST(HloCreationUtilsTest, Prepend2DegenerateDimsToScalar) { CHECK_EQ(*result_literal, *Literal::CreateR2({{9}})); } -TEST(HloCreationUtilsTest, ExpandFirstDimInto3Dims) { +TEST_F(HloCreationUtilsTest, ExpandFirstDimInto3Dims) { HloInstruction* param; HloComputation* entry_computation; @@ -166,7 +171,7 @@ TEST(HloCreationUtilsTest, ExpandFirstDimInto3Dims) { *Literal::CreateR3({{{1, 2}}, {{3, 4}}, {{5, 6}}})); } -TEST(HloCreationUtilsTest, PadVectorWithZeros) { +TEST_F(HloCreationUtilsTest, PadVectorWithZeros) { HloInstruction* 
param; HloComputation* entry_computation; @@ -187,7 +192,7 @@ TEST(HloCreationUtilsTest, PadVectorWithZeros) { CHECK_EQ(*result_literal, *Literal::CreateR1({0, 0, 0, 3, 4, 0})); } -TEST(HloCreationUtilsTest, BroadcastZeros_S32) { +TEST_F(HloCreationUtilsTest, BroadcastZeros_S32) { HloInstruction* param; HloComputation* entry_computation; @@ -208,7 +213,7 @@ TEST(HloCreationUtilsTest, BroadcastZeros_S32) { CHECK_EQ(*result_literal, *Literal::CreateR2({{0, 0}, {0, 0}})); } -TEST(HloCreationUtilsTest, BroadcastZeros_F32) { +TEST_F(HloCreationUtilsTest, BroadcastZeros_F32) { HloInstruction* param; HloComputation* entry_computation; diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc index 3b22c93733af29..dab946a099fa00 100644 --- a/tensorflow/compiler/xla/service/hlo_cse.cc +++ b/tensorflow/compiler/xla/service/hlo_cse.cc @@ -26,12 +26,14 @@ limitations under the License. #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_domain_map.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/inlined_vector.h" namespace xla { @@ -40,16 +42,16 @@ namespace { // Find and combine identical constants. Constants are identical if they have // the same type and value. -bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) { - bool changed = false; - +StatusOr CombineConstants(HloComputation* computation, + bool is_layout_sensitive) { + TF_ASSIGN_OR_RETURN(auto domain_map, HloDomainMap::Create(computation, "")); // Map from ShortDebugString of the layoutless shape of the constant to the // set of constant instructions with that shape. Layoutless shape is used to // bin possible common constants together to reduce number of constant // comparisons. If we end up having too many constant comparisons, a more // precise binning might have to be used. std::multimap constants; - + int64 combined = 0; auto inst_it = computation->instructions().begin(); while (inst_it != computation->instructions().end()) { HloInstruction* instruction = *inst_it; @@ -69,7 +71,8 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) { auto range = constants.equal_range(shape_string); HloInstruction* match = nullptr; for (auto it = range.first; it != range.second; ++it) { - if (instruction->literal() == it->second->literal()) { + if (instruction->literal() == it->second->literal() && + domain_map->InSameDomain(it->second, instruction)) { match = it->second; break; } @@ -80,12 +83,27 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) { // Match found, replace this instruction with the one in the multimap. TF_CHECK_OK(instruction->ReplaceAllUsesWith(match)); TF_CHECK_OK(computation->RemoveInstruction(instruction)); - changed = true; + ++combined; } } } + VLOG(4) << "Combined " << combined << " constants in " << computation->name() + << " computation"; + return combined > 0; +} - return changed; +// An instruction is considered to be equivalent to another only if they +// share the exact same set of operands. 
+int64 CseHash(const HloInstruction* instruction) { + int64 hash = std::hash()(static_cast(instruction->opcode())); + hash = tensorflow::Hash64Combine( + hash, instruction->opcode() == HloOpcode::kGetTupleElement + ? instruction->tuple_index() + : -1); + for (auto operand : instruction->operands()) { + hash = tensorflow::Hash64Combine(hash, operand->unique_id()); + } + return hash; } } // namespace @@ -95,21 +113,34 @@ StatusOr HloCSE::Run(HloModule* module) { const std::function eq_instructions = std::equal_to(); const std::function - eq_computations = std::equal_to(); + eq_computations = [](const HloComputation* lhs, + const HloComputation* rhs) { return *lhs == *rhs; }; + + auto cse_equal = [&](const HloInstruction* lhs, const HloInstruction* rhs) { + return lhs->Identical(*rhs, eq_instructions, eq_computations, + is_layout_sensitive_); + }; + for (auto* computation : module->computations()) { if (only_fusion_computations_ && !computation->IsFusionComputation()) { continue; } - changed |= CombineConstants(computation, is_layout_sensitive_); - - std::list post_order = - computation->MakeInstructionPostOrder(); - std::set removed_instructions; - for (auto instruction : post_order) { - // If the instruction has already been removed by CSE skip over it. - if (removed_instructions.count(instruction) > 0 || - instruction->operand_count() == 0) { + TF_ASSIGN_OR_RETURN(bool combined, + CombineConstants(computation, is_layout_sensitive_)); + changed |= combined; + + // HLO instructions are grouped into equivalency classes by using the + // cse_equal predicate defined above. This set holds a representative + // instruction for each class. + tensorflow::gtl::FlatSet + representatives(/*N=*/1024, &CseHash, cse_equal); + + for (auto instruction : computation->MakeInstructionPostOrder()) { + // If the instruction has zero operands (constants, parameters, etc.) skip + // over it. + if (instruction->operand_count() == 0) { continue; } @@ -118,31 +149,16 @@ StatusOr HloCSE::Run(HloModule* module) { continue; } - // An instruction is considered to be equivalent to another only if they - // share the exact same set of operands. So to find equivalent - // instructions, we just search among instructions which share operand(0) - // of this instruction. - const HloInstruction* operand = instruction->operand(0); - - tensorflow::gtl::InlinedVector - equivalent_instructions; - for (HloInstruction* user : operand->users()) { - if (user != instruction && !user->HasSideEffect() && - user->Identical(*instruction, eq_instructions, eq_computations, - is_layout_sensitive_)) { - equivalent_instructions.push_back(user); - } - } - - // Replace all equivalent instructions with this instruction. 
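[Editor's note] The rewritten `HloCSE::Run` above replaces the per-operand scan over `operand(0)->users()` with a single hash set of "representative" instructions, keyed by `CseHash` and compared with the `Identical`-based `cse_equal` predicate: each instruction is either merged into an earlier equivalent or becomes the representative for its class. The same idea in a self-contained form, using `std::unordered_set` in place of `tensorflow::gtl::FlatSet` and a toy instruction type, is sketched below; it illustrates the data structure only and is not the XLA code itself.

```c++
#include <cstdint>
#include <functional>
#include <unordered_set>
#include <vector>

// Toy stand-in for an HLO instruction: an opcode plus operand ids.
struct Instr {
  int opcode;
  std::vector<int64_t> operand_ids;
};

// Hash combining the opcode with the operand ids, analogous to CseHash above.
struct InstrHash {
  size_t operator()(const Instr* i) const {
    size_t h = std::hash<int>()(i->opcode);
    for (int64_t id : i->operand_ids) h = h * 31 + std::hash<int64_t>()(id);
    return h;
  }
};

// Two instructions are CSE-equal if they have the same opcode and operands.
struct InstrEq {
  bool operator()(const Instr* a, const Instr* b) const {
    return a->opcode == b->opcode && a->operand_ids == b->operand_ids;
  }
};

// Returns, for each instruction in post order, its representative: either an
// earlier equivalent instruction or itself.
std::vector<const Instr*> FindRepresentatives(
    const std::vector<Instr>& post_order) {
  std::unordered_set<const Instr*, InstrHash, InstrEq> representatives;
  std::vector<const Instr*> result;
  for (const Instr& instr : post_order) {
    auto it = representatives.find(&instr);
    if (it != representatives.end()) {
      result.push_back(*it);  // Duplicate: reuse the representative.
    } else {
      representatives.insert(&instr);
      result.push_back(&instr);
    }
  }
  return result;
}
```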
- for (HloInstruction* equivalent_instruction : equivalent_instructions) { - TF_RETURN_IF_ERROR( - equivalent_instruction->ReplaceAllUsesWith(instruction)); + auto it = representatives.find(instruction); + if (it != representatives.end()) { + HloInstruction* equivalent_instruction = *it; TF_RETURN_IF_ERROR( - computation->RemoveInstruction(equivalent_instruction)); - removed_instructions.insert(equivalent_instruction); + instruction->ReplaceAllUsesWith(equivalent_instruction)); + TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction)); changed = true; + continue; } + representatives.insert(instruction); } } return changed; diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc index df8853f34f6a72..16db374566c727 100644 --- a/tensorflow/compiler/xla/service/hlo_cse_test.cc +++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/types.h" @@ -72,7 +73,7 @@ TEST_F(HloCseTest, CombineTwoConstants) { auto result = ExecuteAndTransfer(std::move(module), {}); auto expected = Literal::CreateR0(84.0); - LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4)); + EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4))); } TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) { @@ -104,7 +105,7 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndInsensitive) { auto result = ExecuteAndTransfer(std::move(module), {}); auto expected = Literal::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); - LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4)); + EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4))); } TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) { @@ -134,38 +135,53 @@ TEST_F(HloCseTest, CombineTwoConstantsDifferentLayoutsAndSensitive) { auto result = ExecuteAndTransfer(std::move(module), {}); auto expected = Literal::CreateR2({{2.0, 4.0}, {6.0, 8.0}}); - LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(1e-4)); + EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(1e-4))); } TEST_F(HloCseTest, ConstantsSameValueDifferentType) { // Test that constants with the same value but different type are *not* // commoned. 
auto builder = HloComputation::Builder(TestName()); - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42))); - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42))); - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0))); - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + std::vector constants; + constants.push_back(builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42)))); + constants.push_back(builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42)))); + constants.push_back(builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42.0)))); + constants.push_back(builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42.0)))); + constants.push_back(builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42.0)))); + constants.push_back(builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42.0f)))); // Duplicate the float constant to verify something happens. - builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(42.0f))); + constants.push_back(builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42.0f)))); + + const Shape shape_r0 = ShapeUtil::MakeShape(F32, {}); + for (int64 i = 0; i < constants.size(); ++i) { + constants[i] = builder.AddInstruction( + HloInstruction::CreateConvert(shape_r0, constants[i])); + } + HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary( + shape_r0, HloOpcode::kAdd, constants[0], constants[1])); + for (int64 i = 2; i < constants.size(); ++i) { + root = builder.AddInstruction(HloInstruction::CreateBinary( + shape_r0, HloOpcode::kAdd, root, constants[i])); + } auto module = CreateNewModule(); auto computation = module->AddEntryComputation(builder.Build()); - EXPECT_EQ(7, computation->instruction_count()); + EXPECT_EQ(20, computation->instruction_count()); HloCSE cse(/*is_layout_sensitive=*/false); EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); - EXPECT_EQ(6, computation->instruction_count()); + // CSE will remove both the second float(42.0f) and the corresponding + // convert/cast. 
+ EXPECT_EQ(18, computation->instruction_count()); } TEST_F(HloCseTest, NonscalarConstants) { @@ -469,5 +485,56 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) { EXPECT_THAT(root, op::Add(op::Map(op::Constant()), op::Map(op::Constant()))); } +TEST_F(HloCseTest, CompareComputations) { + auto module = ParseHloString(R"( + HloModule m + + add_computation { + add_lhs = f32[] parameter(0) + add_rhs = f32[] parameter(1) + ROOT add_root = f32[] add(add_lhs, add_rhs) + } + + add_computation2 { + add_lhs2 = f32[] parameter(0) + add_rhs2 = f32[] parameter(1) + ROOT add_root2 = f32[] add(add_lhs2, add_rhs2) + } + + ENTRY entry { + p = f32[10]{0} parameter(0) + c = f32[] constant(0) + r1 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation + r2 = f32[] reduce(p, c), dimensions={0}, to_apply=add_computation2 + ROOT f2 = (f32[],f32[]) tuple(r1, r2) + })") + .ValueOrDie(); + + HloCSE cse(/*is_layout_sensitive=*/false); + EXPECT_TRUE(cse.Run(module.get()).ValueOrDie()); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_EQ(root->operand(0), root->operand(1)); +} + +TEST_F(HloCseTest, ConstantsSameValueInDifferentDomains) { + // Test that constants with the same value but in different domains (disjoint + // in this case) are not collapsed. + auto builder = HloComputation::Builder(TestName()); + builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42))); + builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(42))); + + auto module = CreateNewModule(); + auto computation = module->AddEntryComputation(builder.Build()); + + EXPECT_EQ(2, computation->instruction_count()); + + HloCSE cse(/*is_layout_sensitive=*/false); + EXPECT_FALSE(cse.Run(module.get()).ValueOrDie()); + + EXPECT_EQ(2, computation->instruction_count()); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc index 0c37a8d75f38da..cc130a4900dc16 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc @@ -363,7 +363,7 @@ bool HloDataflowAnalysis::UpdateCallValueSet(HloInstruction* call) { bool HloDataflowAnalysis::UpdateConditionalValueSet( HloInstruction* conditional) { CHECK_EQ(conditional->opcode(), HloOpcode::kConditional); - std::vector inputs = { + const InstructionValueSet* const inputs[] = { &GetInstructionValueSet( conditional->true_computation()->root_instruction()), &GetInstructionValueSet( @@ -538,7 +538,7 @@ bool HloDataflowAnalysis::UpdateTupleValueSet(HloInstruction* tuple) { bool HloDataflowAnalysis::UpdateWhileValueSet(HloInstruction* xla_while) { CHECK_EQ(xla_while->opcode(), HloOpcode::kWhile); - std::vector inputs = { + const InstructionValueSet* const inputs[] = { &GetInstructionValueSet(xla_while->while_body()->root_instruction()), &GetInstructionValueSet(xla_while->operand(0))}; if (ssa_form_) { @@ -878,4 +878,128 @@ Status HloDataflowAnalysis::Verify() const { return Status::OK(); } +bool HloDataflowAnalysis::DoesNotUseOperandBuffer( + const HloInstruction* operand, const ShapeIndex& index, + const HloInstruction* user) const { + CHECK(user->IsUserOf(operand)) + << "user: " << user->ToString() << " operand: " << operand->ToString(); + if (user->opcode() == HloOpcode::kFusion && + user->fusion_kind() == HloInstruction::FusionKind::kLoop) { + // Find fusion parameter associated with 'operand'. 
+ HloInstruction* fusion_param = + user->fused_parameter(user->operand_index(operand)); + // Iterate through all users of all uses of the fusion parameter value. + // Return false if any uses are detected, returns true otherwise. + const HloValue& value = GetValueDefinedAt(fusion_param, index); + return value.uses().empty(); + } else { + // Return false if no value at 'operand' and 'index' is used at 'user'. + for (const HloValue* value : GetValueSet(operand, index).values()) { + for (const HloUse& use : value->uses()) { + if (use.instruction == user) { + return false; + } + } + } + } + + return true; +} + +bool HloDataflowAnalysis::CanShareOperandBufferWithUser( + HloInstruction* operand, const ShapeIndex& operand_index, + HloInstruction* user, const ShapeIndex& user_index) const { + CHECK(user->IsUserOf(operand)) + << "user: " << user->ToString() << " operand: " << operand->ToString(); + const Shape& operand_subshape = + ShapeUtil::GetSubshape(operand->shape(), operand_index); + const Shape& user_subshape = + ShapeUtil::GetSubshape(user->shape(), user_index); + // Check that operand and user emit the same shape and layout. + if (!ShapeUtil::Equal(operand_subshape, user_subshape)) { + return false; + } + + if (user->opcode() == HloOpcode::kFusion) { + // Get the parameter associated with 'operand'; + HloInstruction* fusion_param = + user->fused_parameter(user->operand_index(operand)); + + const HloValue& value = GetValueDefinedAt(fusion_param, operand_index); + if (value.uses().size() != 1) { + return false; + } + const HloUse& use = value.uses()[0]; + + if (user->fusion_kind() == HloInstruction::FusionKind::kLoop && + user->fused_expression_root()->opcode() == + HloOpcode::kDynamicUpdateSlice) { + // Loop fusion with kDynamicUpdateSlice fused root. + // + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root at operand + // index 0. + return use.instruction == user->fused_expression_root() && + use.operand_number == 0; + } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && + user->fused_expression_root()->opcode() == HloOpcode::kAdd) { + // Output fusion with kAdd fused root. + + // Check if one operand of kAdd fused root is kDot or kConvolution. + auto* add = user->fused_expression_root(); + auto add_operand_it = + std::find_if(add->operands().begin(), add->operands().end(), + [&](HloInstruction* operand) { + return operand->opcode() == HloOpcode::kConvolution || + operand->opcode() == HloOpcode::kDot; + }); + if (add_operand_it == add->operands().end()) { + return false; + } + auto* matched_add_operand = *add_operand_it; + // Calculate operand index of 'add' operand which was not matched above. + const int64 other_add_operand_index = + matched_add_operand == add->operand(0) ? 1 : 0; + // Returns true iff there is exactly one use of 'operand' at shape index + // 'operand_index', and this singleton use is the fused root (at operand + // index 'other_add_operand_index'). + return use.instruction == user->fused_expression_root() && + use.operand_number == other_add_operand_index; + } + } + if (user->opcode() == HloOpcode::kDynamicUpdateSlice || + user->opcode() == HloOpcode::kWhile) { + // We eliminated other users in BufferLiveness::live_range_strictly_before, + // so here we just need to check that the use is at operand index 0. 
+ std::vector operand_indices = user->OperandIndices(operand); + return operand_indices.size() == 1 && operand_indices[0] == 0; + } + if (user->opcode() == HloOpcode::kCall) { + // Get all uses of value defined by 'operand' at 'operand_index'. + const auto& uses = GetValueDefinedAt(operand, operand_index).uses(); + // Return true iff: + // *) There exists two uses of 'operand'. + // *) One use is by 'user' (caller). + // *) One use is by root instruction of called computation (callee root). + // (Note: we check the root of the called computation, because the + // root result buffer is required to alias with the Call result buffer). + // *) The root instruction of the called computation is element-wise on + // 'operand'. + const bool found_caller_use = + std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) { + return use.instruction == user; + }) != uses.end(); + auto* callee_root = user->to_apply()->root_instruction(); + const bool found_elementwise_callee_use = + std::find_if( + uses.begin(), uses.end(), [callee_root](const HloUse& use) { + return use.instruction == callee_root && + callee_root->IsElementwiseOnOperand(use.operand_number); + }) != uses.end(); + return uses.size() == 2 && found_caller_use && found_elementwise_callee_use; + } + // Check if 'user' is element-wise. + return user->IsElementwise(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h index 7b8a74b096ff48..9868746b611388 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h @@ -118,6 +118,23 @@ class HloDataflowAnalysis { string ToString() const; + // Returns true if 'user' cannot possibly use the buffer at 'index' in + // 'operand'. Returns false otherwise. + // + // REQUIRES: 'operand' is an operand of 'user'. + bool DoesNotUseOperandBuffer(const HloInstruction* operand, + const ShapeIndex& index, + const HloInstruction* user) const; + + // Returns true if 'user' (at 'user_index') can share a buffer with its + // operand 'operand' (at 'operand_index'). Returns false otherwise. + // + // REQUIRES: 'operand' is an operand of 'user'. 
+ bool CanShareOperandBufferWithUser(HloInstruction* operand, + const ShapeIndex& operand_index, + HloInstruction* user, + const ShapeIndex& user_index) const; + protected: HloDataflowAnalysis(const HloModule& module, bool ssa_form, bool bitcast_defines_value = false); diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc index 07f69b8e1339fe..5798326dcbf65c 100644 --- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc +++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc @@ -1873,5 +1873,346 @@ INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation, HloDataflowAnalysisTest, ::testing::Values(false, true)); +class HloDataflowAnalysisTestBase : public HloTestBase { + protected: + void BuildModule(std::unique_ptr computation) { + module_ = CreateNewModule(); + computation_ = module_->AddEntryComputation(std::move(computation)); + } + + void RunAnalysis() { + CHECK_NOTNULL(module_.get()); + dataflow_analysis_ = HloDataflowAnalysis::Run(*module_).ConsumeValueOrDie(); + } + + void BuildModuleAndRunAnalysis(std::unique_ptr computation) { + BuildModule(std::move(computation)); + RunAnalysis(); + } + + std::unique_ptr module_; + HloComputation* computation_ = nullptr; + std::unique_ptr dataflow_analysis_; +}; + +class DoesNotUseOperandBufferTest : public HloDataflowAnalysisTestBase {}; + +TEST_F(DoesNotUseOperandBufferTest, GetTupleElement) { + auto builder = HloComputation::Builder(TestName()); + + Shape elem_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({elem_shape, elem_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(elem_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(elem_shape, tuple, 1)); + builder.AddInstruction( + HloInstruction::CreateBinary(elem_shape, HloOpcode::kAdd, gte0, gte1)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // GetTupleElement instructions only access the top-level buffer of their + // operand. + EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {0}, gte0)); + EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, gte1)); + EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte0)); + EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte1)); +} + +TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. 
+ auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR1({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, gte1, update, starts)); + builder.AddInstruction( + HloInstruction::CreateTuple({gte0, dynamic_update_slice})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dynamic_update_slice, starts, update, gte1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + // The fusion instruction never uses tuple element 0, but does use element 1. + EXPECT_TRUE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {0}, fusion)); + EXPECT_FALSE(dataflow_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion)); +} + +class CanShareOperandBufferWithUserTest : public HloDataflowAnalysisTestBase {}; + +TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) { + auto builder = HloComputation::Builder(TestName()); + + Shape shape = ShapeUtil::MakeShape(F32, {8}); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); + auto log = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kLog, exp)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {})); + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(exp, {}, log, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) { + auto builder = HloComputation::Builder(TestName()); + + Shape in_shape = ShapeUtil::MakeShape(F32, {8}); + Shape out_shape = ShapeUtil::MakeShape(PRED, {8}); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, in_shape, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, in_shape, "param1")); + auto result = builder.AddInstruction( + HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {}, + result, {})); + EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param1, {}, + result, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, CopyShares) { + auto builder = HloComputation::Builder(TestName()); + + Shape shape = ShapeUtil::MakeShape(F32, {8}); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); + auto copy = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {})); + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(exp, {}, copy, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto 
gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. + auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR1({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, gte1, update, starts)); + builder.AddInstruction( + HloInstruction::CreateTuple({gte0, dynamic_update_slice})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dynamic_update_slice, starts, update, gte1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + // The fusion instruction can share with tuple element 1. + EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(tuple, {0}, + fusion, {})); + EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(tuple, {1}, + fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + Shape update_shape = ShapeUtil::MakeShape(F32, {4}); + Shape starts_shape = ShapeUtil::MakeShape(S32, {1}); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + auto update = builder.AddInstruction( + HloInstruction::CreateParameter(1, update_shape, "update")); + auto starts = builder.AddInstruction( + HloInstruction::CreateParameter(2, starts_shape, "starts")); + auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, data, update, starts)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // The DynamicUpdateSlice instruction can share with the data operand, but not + // with update or starts. + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(data, {}, dus, {})); + EXPECT_FALSE( + dataflow_analysis_->CanShareOperandBufferWithUser(update, {}, dus, {})); + EXPECT_FALSE( + dataflow_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto a = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); + auto b = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = builder.AddInstruction( + HloInstruction::CreateDot(data_shape, a, b, dot_dnums)); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto add_operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape, HloOpcode::kAdd, dot, add_operand)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, dot}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused dot add should be able to share buffer with 'add_operand'. 
+ EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(add_operand, {}, + fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto reverse = builder.AddInstruction( + HloInstruction::CreateReverse(data_shape, operand, {0, 1})); + + auto two = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, two, reverse}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused operand->reverse->add cannot alias operand buffer 'operand'. + EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(operand, {}, + fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + + auto make_cond = [this, &data_shape]() { + auto builder = HloComputation::Builder(TestName() + ".Cond"); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data)); + return builder.Build(); + }; + + auto make_body = [this, &data_shape]() { + auto builder = HloComputation::Builder(TestName() + ".Body"); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data)); + return builder.Build(); + }; + + module_ = CreateNewModule(); + HloComputation* cond_computation = + module_->AddEmbeddedComputation(make_cond()); + HloComputation* body_computation = + module_->AddEmbeddedComputation(make_body()); + + auto builder = HloComputation::Builder(TestName()); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + auto whil = builder.AddInstruction(HloInstruction::CreateWhile( + data_shape, cond_computation, body_computation, data)); + computation_ = module_->AddEntryComputation(builder.Build()); + + RunAnalysis(); + + // The While instruction can share with the data operand. + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(data, {}, whil, {})); +} + +// Tests that Call can alias operand buffer if the only use of the operand +// in the called computation is an elementwise instruction. +TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { + Shape shape = ShapeUtil::MakeShape(F32, {8}); + // Build sub-computation with fusion root. 
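// Illustrative sketch, not part of this change: the DynamicUpdateSlice cases
// above rely on the fact that an in-place update only touches the window being
// written, so the output may reuse the buffer of the `data` operand, while
// `update` and `starts` are read-only inputs of different shapes and cannot be
// aliased with the output. In plain scalar terms the in-place form is:
//
//   std::vector<float> data(8, 0.f);               // the "data" operand
//   const std::vector<float> update = {2.f, 2.f, 2.f};
//   const int64_t start = 2;                       // the "starts" operand
//   for (int64_t i = 0; i < static_cast<int64_t>(update.size()); ++i) {
//     data[start + i] = update[i];                 // only the window changes
//   }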
+ auto sub_builder = HloComputation::Builder(TestName() + "_sub"); + auto sub_param = sub_builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "sub_param")); + auto one = sub_builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto ones = sub_builder.AddInstruction( + HloInstruction::CreateBroadcast(shape, one, {1})); + auto add = sub_builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones)); + + module_ = CreateNewModule(); + auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build()); + sub_computation->CreateFusionInstruction({add, ones}, + HloInstruction::FusionKind::kLoop); + + // Build entry-computation with kCall which calls 'sub_computation'. + auto builder = HloComputation::Builder(TestName()); + + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto reverse = + builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0})); + auto call = builder.AddInstruction( + HloInstruction::CreateCall(shape, {reverse}, sub_computation)); + computation_ = module_->AddEntryComputation(builder.Build()); + + RunAnalysis(); + + EXPECT_TRUE( + dataflow_analysis_->CanShareOperandBufferWithUser(reverse, {}, call, {})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc new file mode 100644 index 00000000000000..78955db0da02f1 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc @@ -0,0 +1,104 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_domain_isolator.h" + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +class HloDomainIsolator::RunContext { + public: + RunContext(HloModule* module, HloDomainIsolator* isolator) + : module_(module), isolator_(isolator) {} + + StatusOr Run(); + + private: + // Inserts a kDomain instruction between parent and operand, in case + // the attribute (ie, sharding) values change between instruction and operand. + // Returns the newly inserted kDomain instruction, or nullptr if no kDomain + // instruction was necessary. 
+ StatusOr CreateDomain(HloInstruction* instruction, + HloInstruction* parent, + HloInstruction* operand); + + HloModule* module_; + HloDomainIsolator* isolator_; +}; + +StatusOr HloDomainIsolator::RunContext::CreateDomain( + HloInstruction* instruction, HloInstruction* parent, + HloInstruction* operand) { + HloInstruction* domain = nullptr; + std::unique_ptr domain_instruction = + isolator_->creator_(instruction, operand); + if (domain_instruction != nullptr) { + domain = operand->parent()->AddInstruction(std::move(domain_instruction)); + TF_RETURN_IF_ERROR(operand->ReplaceUseWith(parent, domain)); + } + return domain; +} + +StatusOr HloDomainIsolator::RunContext::Run() { + hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Isolator"); + + int64 added_domains = 0; + for (HloComputation* computation : module_->computations()) { + // Walk in post order and place all the required kDomain instructions. + for (HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { + if (instruction->opcode() == HloOpcode::kDomain) { + continue; + } + for (HloInstruction* operand : instruction->unique_operands()) { + // When applying multiple domains, we could end up stacking more than + // one in one edge, so here we want to build the effective + // (kDomain-less) instruction->operand edge. + HloInstruction* parent = instruction; + while (operand->opcode() == HloOpcode::kDomain) { + parent = operand; + operand = operand->mutable_operand(0); + } + // Check whether a kDomain is necessary between instruction and operand. + TF_ASSIGN_OR_RETURN(HloInstruction * domain, + CreateDomain(instruction, parent, operand)); + if (domain != nullptr) { + VLOG(4) << "New domain: " << domain->ToString(); + ++added_domains; + } + } + } + } + VLOG(3) << "Added " << added_domains << " kDomain instructions"; + if (added_domains > 0) { + hlo_graph_dumper::MaybeDumpHloModule(*module_, "After Domain Isolator"); + } + return added_domains > 0; +} + +HloDomainIsolator::HloDomainIsolator(DomainCreator creator) + : creator_(std::move(creator)) {} + +StatusOr HloDomainIsolator::Run(HloModule* module) { + RunContext run_context(module, this); + return run_context.Run(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.h b/tensorflow/compiler/xla/service/hlo_domain_isolator.h new file mode 100644 index 00000000000000..e0c5718509dabe --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.h @@ -0,0 +1,56 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_ISOLATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_ISOLATOR_H_
+
+#include
+#include
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Domain isolation is the task of placing kDomain instructions between HLO
+// instructions having different sharding. A kDomain instruction is essentially
+// used to break an HLO graph edge connecting two instructions with different
+// sharding. If a set of connected instructions all have the same sharding, no
+// kDomain instruction will be placed.
+class HloDomainIsolator : public HloPassInterface {
+ public:
+  // Creates a new kDomain instruction for the edge between the use instruction
+  // (the first HloInstruction argument), and the operand instruction (the
+  // second HloInstruction argument).
+  // Returns nullptr in case no domain separation is necessary.
+  using DomainCreator = std::function<std::unique_ptr<HloInstruction>(
+      HloInstruction*, HloInstruction*)>;
+
+  explicit HloDomainIsolator(DomainCreator creator);
+
+  tensorflow::StringPiece name() const override { return "domain_isolator"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  class RunContext;
+
+  DomainCreator creator_;
+};
+
+} // namespace xla
+
+#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_ISOLATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
new file mode 100644
index 00000000000000..ebd5adb5d573ce
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -0,0 +1,176 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_domain_map.h" + +#include + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +/* static */ StatusOr> HloDomainMap::Create( + HloComputation* computation, string domain_kind) { + auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind))); + TF_RETURN_IF_ERROR(domain_map->Populate(computation)); + return std::move(domain_map); +} + +/* static */ StatusOr> HloDomainMap::Create( + HloModule* module, string domain_kind) { + auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind))); + for (HloComputation* computation : module->computations()) { + TF_RETURN_IF_ERROR(domain_map->Populate(computation)); + } + return std::move(domain_map); +} + +bool HloDomainMap::InSameDomain(HloInstruction* instruction1, + HloInstruction* instruction2) const { + int64 domain_id1 = FindOrDefault(instruction_to_domain_, instruction1, -1); + int64 domain_id2 = FindOrDefault(instruction_to_domain_, instruction2, -1); + return domain_id1 >= 0 && domain_id1 == domain_id2; +} + +Status HloDomainMap::TryProcessEmptyDomain(HloInstruction* instruction) { + TF_RET_CHECK(instruction->opcode() == HloOpcode::kDomain); + // We only check operands, so we are sure to not process the empty domain from + // both sides. + for (HloInstruction* operand : instruction->unique_operands()) { + if (IsDomainInstruction(operand)) { + auto domain = MakeUnique(); + domain->enter_domains.insert(operand); + domain->exit_domains.insert(instruction); + TF_RETURN_IF_ERROR(InsertDomain(std::move(domain))); + } + } + return Status::OK(); +} + +Status HloDomainMap::Populate(HloComputation* computation) { + for (HloInstruction* instruction : computation->instructions()) { + if (IsDomainInstruction(instruction)) { + // If this is a kDomain of the kind we are currently processing, check + // whether this is an "empty domain". + TF_RETURN_IF_ERROR(TryProcessEmptyDomain(instruction)); + continue; + } + int64 domain_id = FindOrDefault(instruction_to_domain_, instruction, -1); + if (domain_id >= 0) { + // We have already processed this instruction. + continue; + } + TF_ASSIGN_OR_RETURN(std::unique_ptr domain, + CreateDomain(instruction)); + TF_RETURN_IF_ERROR(InsertDomain(std::move(domain))); + } + return Status::OK(); +} + +Status HloDomainMap::InsertDomain( + std::unique_ptr domain) { + int64 domain_id = instruction_domains_.size(); + instruction_domains_.push_back(std::move(domain)); + for (HloInstruction* instruction : instruction_domains_.back()->reach_set) { + instruction_to_domain_[instruction] = domain_id; + } + return Status::OK(); +} + +Status HloDomainMap::ExpandDomain(HloInstruction* instruction, + DomainMetadata::Domain* domain) const { + std::vector in_queue; + in_queue.push_back(instruction); + while (!in_queue.empty()) { + HloInstruction* current_instruction = in_queue.back(); + in_queue.pop_back(); + if (domain->reach_set.insert(current_instruction).second) { + // We should not be finding instructions with assigned domain here. + // If we assigned a domain to the instruction, it means that all the + // instructions reached by it, should have a domain as well. 
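// A stripped-down, standalone sketch (illustrative only, not part of the XLA
// sources): ExpandDomain is a worklist flood fill over operand and user edges
// that stops at kDomain instructions. kDomain nodes met on the operand side
// become enter_domains (dataflow enters the region through them), and kDomain
// nodes met on the user side become exit_domains. The same traversal over a
// toy graph type looks like this:
//
//   struct Node {
//     std::vector<Node*> operands, users;
//     bool is_domain = false;
//   };
//   void Expand(Node* seed, std::set<Node*>* reach, std::set<Node*>* enter,
//               std::set<Node*>* exit) {
//     std::vector<Node*> worklist = {seed};
//     while (!worklist.empty()) {
//       Node* n = worklist.back();
//       worklist.pop_back();
//       if (!reach->insert(n).second) continue;  // already visited
//       for (Node* op : n->operands) {
//         if (op->is_domain) enter->insert(op); else worklist.push_back(op);
//       }
//       for (Node* u : n->users) {
//         if (u->is_domain) exit->insert(u); else worklist.push_back(u);
//       }
//     }
//   }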
+ int64 domain_id = + FindOrDefault(instruction_to_domain_, current_instruction, -1); + TF_RET_CHECK(domain_id < 0) + << "Instruction " << current_instruction->ToString() + << " already has domain " << domain_id; + for (HloInstruction* operand : current_instruction->operands()) { + if (IsDomainInstruction(operand)) { + // The reach set instruction is a user of the domain instruction + // (the instruction sees the kDomain as operand). + // IOW the dataflow enters the domain through the kDomain instruction. + domain->enter_domains.insert(operand); + } else { + in_queue.push_back(operand); + } + } + for (HloInstruction* user : current_instruction->users()) { + if (IsDomainInstruction(user)) { + // The reach set instruction is an operand of the domain instruction + // (the instruction sees the kDomain as user). + // IOW the dataflow exits the domain through the kDomain instruction. + domain->exit_domains.insert(user); + } else { + in_queue.push_back(user); + } + } + } + } + return Status::OK(); +} + +StatusOr> HloDomainMap::CreateDomain( + HloInstruction* instruction) const { + auto domain = MakeUnique(); + TF_RETURN_IF_ERROR(ExpandDomain(instruction, domain.get())); + domain->instructions = MakeNonDomainInstructions(domain->reach_set); + return std::move(domain); +} + +bool HloDomainMap::IsDomainInstruction(HloInstruction* instruction) const { + if (instruction->opcode() != HloOpcode::kDomain) { + return false; + } + if (!domain_kind_.empty()) { + if (instruction->user_side_metadata().Kind() != domain_kind_) { + return false; + } + // Both user and operand side of the metadata must be of the same kind. + CHECK(instruction->operand_side_metadata().Kind() == domain_kind_) + << "Instruction " << instruction->ToString() + << " has mismatching metadata kinds"; + } + return true; +} + +/* static */ std::vector +HloDomainMap::MakeNonDomainInstructions( + const tensorflow::gtl::FlatSet& instruction_set) { + std::vector instructions; + instructions.reserve(instruction_set.size()); + for (HloInstruction* instruction : instruction_set) { + if (instruction->opcode() != HloOpcode::kDomain) { + instructions.push_back(instruction); + } + } + std::sort(instructions.begin(), instructions.end(), + [](HloInstruction* a, HloInstruction* b) { + return a->unique_id() < b->unique_id(); + }); + return instructions; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h new file mode 100644 index 00000000000000..e62ef763fb3881 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_map.h @@ -0,0 +1,108 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_MAP_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_MAP_H_
+
+#include
+#include
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace xla {
+
+// The HloDomainMap splits the instructions within a module or computation into
+// different domains, separated by kDomain instructions.
+// A domain is composed of a set of instructions which can reach each other via
+// operand/user edges, without crossing a kDomain instruction of a given kind.
+// A domain never crosses computation boundaries.
+class HloDomainMap {
+ public:
+  // Creates a new HloDomainMap, creating all the domains within the input
+  // computation, of the given kind. If domain_kind is not empty, only the
+  // kDomain instructions of domain_kind will be considered as separators.
+  // Otherwise every kDomain instruction will be splitting domains.
+  static StatusOr<std::unique_ptr<HloDomainMap>> Create(
+      HloComputation* computation, string domain_kind);
+
+  // Creates a new HloDomainMap, creating all the domains within the input
+  // module, of the given kind. If domain_kind is not empty, only the
+  // kDomain instructions of domain_kind will be considered as separators.
+  // Otherwise every kDomain instruction will be splitting domains.
+  static StatusOr<std::unique_ptr<HloDomainMap>> Create(HloModule* module,
+                                                        string domain_kind);
+
+  // Retrieves all the domains the input module or computation is composed of.
+  const std::vector<std::unique_ptr<DomainMetadata::Domain>>& GetDomains()
+      const {
+    return instruction_domains_;
+  }
+
+  // Checks whether two instructions are within the same domain.
+  bool InSameDomain(HloInstruction* instruction1,
+                    HloInstruction* instruction2) const;
+
+  // Checks whether instruction is a kDomain instruction of the kind we are
+  // currently processing.
+  bool IsDomainInstruction(HloInstruction* instruction) const;
+
+ private:
+  HloDomainMap(string domain_kind) : domain_kind_(std::move(domain_kind)) {}
+
+  // Check if the kDomain instruction is facing (via its operand link) another
+  // kDomain instruction of the same kind, hence defining an empty domain.
+  // If that is the case, create the empty domain and call the proper
+  // normalizer.
+  Status TryProcessEmptyDomain(HloInstruction* instruction);
+
+  Status Populate(HloComputation* computation);
+
+  // Inserts the provided domain into the ones tracked by this object,
+  // creating a new domain ID.
+  Status InsertDomain(std::unique_ptr<DomainMetadata::Domain> domain);
+
+  // From the given instruction, expands operand- and user-wise the set of
+  // instructions which can be reached without crossing a kDomain instruction
+  // of the kind specified by domain_kind_.
+  // The domain data structure will be populated with all the reached
+  // instructions, and the boundaries of the domain, with the kDomain
+  // instructions encountered while expanding the reach.
+  Status ExpandDomain(HloInstruction* instruction,
+                      DomainMetadata::Domain* domain) const;
+
+  // Creates a domain data structure using the ExpandDomain() API.
+ StatusOr> CreateDomain( + HloInstruction* instruction) const; + + // Out of an instruction set, returns a vector of all the ones which are not + // a kDomain kind. + static std::vector MakeNonDomainInstructions( + const tensorflow::gtl::FlatSet& instruction_set); + + string domain_kind_; + std::vector> instruction_domains_; + tensorflow::gtl::FlatMap instruction_to_domain_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_MAP_H_ diff --git a/tensorflow/compiler/xla/service/hlo_domain_metadata.h b/tensorflow/compiler/xla/service/hlo_domain_metadata.h new file mode 100644 index 00000000000000..aa0308100a21f1 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_metadata.h @@ -0,0 +1,84 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_METADATA_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_METADATA_H_ + +#include +#include +#include + +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/core/stringpiece.h" +#include "tensorflow/core/lib/gtl/flatset.h" + +namespace xla { + +// Cannot include hlo_instruction.h as this file is included from there. +class HloInstruction; + +// The DomainMetadata represents the base class for metadata which can be +// attached to kDomain HLO instructions. +class DomainMetadata { + public: + // A Domain data structure captures all the information about a kDomain + // bounded instruction set. + struct Domain { + // The set of instructions which are reachable from each other via + // operand/user pathways, without crossing a kDomain instruction of a given + // kind. The reach_set can contain kDomain instructions of other kinds, if + // two domains of different kind intersect each other. + tensorflow::gtl::FlatSet reach_set; + + // The same instructions in reach_set, but purged from kDomain instructions. + std::vector instructions; + + // If we consider a graph edge as an arrow oriented from the operand to the + // user, the enter_domains will contain the set of kDomain instructions + // whose dataflow enters the reach set (domain), while the exit_domains + // contains the set of kDomain instructions whose dataflow exit the reach + // set. + tensorflow::gtl::FlatSet enter_domains; + tensorflow::gtl::FlatSet exit_domains; + }; + + virtual ~DomainMetadata() = default; + + // Clones the metadata object. + virtual std::unique_ptr Clone() const = 0; + + // Returns the metadata type. A unique identifier which describes the real + // metadata type. + virtual tensorflow::StringPiece Kind() const = 0; + + // Compares the metadata object with another one and returns true if the + // two matches. + virtual bool Matches(const DomainMetadata& other) const = 0; + + // Returns a string representation of the metadata. 
+ virtual string ToString() const = 0; + + // Given a reachable set (the set of instructions which are reachable from + // each other via user/operand pathways, without crossing a kDomain + // instruciton), makes sure that all of them have metadata attributes which + // are coherent with this metadata object. + virtual Status NormalizeInstructions(const Domain& domain) const = 0; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_METADATA_H_ diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.cc b/tensorflow/compiler/xla/service/hlo_domain_remover.cc new file mode 100644 index 00000000000000..1d06040b0e7c92 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_remover.cc @@ -0,0 +1,149 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_domain_remover.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_domain_isolator.h" +#include "tensorflow/compiler/xla/service/hlo_domain_map.h" +#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/types.h" + +namespace xla { + +class HloDomainRemover::RunContext { + public: + RunContext(HloModule* module, HloDomainRemover* remover) + : module_(module), remover_(remover) {} + + StatusOr Run(); + + private: + // Verifies the consistency of the domain, and normalizes the instructions + // within it. + Status VerifyAndNormalizeDomain(const DomainMetadata::Domain& domain); + + HloModule* module_; + HloDomainRemover* remover_; +}; + +Status HloDomainRemover::RunContext::VerifyAndNormalizeDomain( + const DomainMetadata::Domain& domain) { + // Verify that the whole kDomain frontier bounding the instruction reach set, + // has matching metadata. + // A kDomain instruction has two sides of metadata, a user facing and an + // operand facing. + // A reachable instruction set can make contact with a kDomain instruction on + // a user facing side (the kDomain is operand of the instruction), or on a + // operand facing side (the kDomain is user of the instruction). + // And depending on the contact side, the proper metadata object + // (user_side_metadata() vs. operand_side_metadata()) needs to be used for + // consistency checks. 
+ const DomainMetadata* ref_metadata = nullptr; + VLOG(4) << "Reach set:"; + for (HloInstruction* instruction : domain.instructions) { + VLOG(4) << " " << instruction->name(); + } + VLOG(4) << " Domains:"; + for (HloInstruction* instruction : domain.enter_domains) { + const DomainMetadata& meta = instruction->user_side_metadata(); + VLOG(4) << " User side: " << instruction->name(); + VLOG(4) << " " << meta.ToString(); + if (ref_metadata == nullptr) { + ref_metadata = &meta; + } else { + TF_RET_CHECK(meta.Matches(*ref_metadata)) + << "Metadata mismatch at instruction " << instruction->name() << " : " + << meta.ToString() << " vs " << ref_metadata->ToString(); + } + } + for (HloInstruction* instruction : domain.exit_domains) { + const DomainMetadata& meta = instruction->operand_side_metadata(); + VLOG(4) << " Operand side: " << instruction->name(); + VLOG(4) << " " << meta.ToString(); + if (ref_metadata == nullptr) { + ref_metadata = &meta; + } else { + TF_RET_CHECK(meta.Matches(*ref_metadata)) + << "Metadata mismatch at instruction " << instruction->name() << " : " + << meta.ToString() << " vs " << ref_metadata->ToString(); + } + } + if (ref_metadata != nullptr) { + VLOG(4) << "Applying domain normalization: " << ref_metadata->ToString(); + TF_RETURN_IF_ERROR(ref_metadata->NormalizeInstructions(domain)); + } else { + // No kDomain instruction was present within this domain, so call the + // generic normalization functions and have them apply their heuristic. + VLOG(2) << "Applying domain-less normalization"; + TF_RETURN_IF_ERROR(remover_->normalizer_(domain)); + } + return Status::OK(); +} + +StatusOr HloDomainRemover::RunContext::Run() { + VLOG(4) << "Processing metadata domain: '" << remover_->kind_ << "'"; + hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Remover"); + + int64 removed_domains = 0; + for (HloComputation* computation : module_->computations()) { + // First create the domain instruciton sets. A domain instruction set is + // the set of instructions whose edges never cross a kDomain instruction. + TF_ASSIGN_OR_RETURN(std::unique_ptr domain_map, + HloDomainMap::Create(computation, remover_->kind_)); + // Verify and normalize every domain populated within the map. + for (auto& domain : domain_map->GetDomains()) { + TF_RETURN_IF_ERROR(VerifyAndNormalizeDomain(*domain)); + } + + // Now remove all the kDomain instructions of the kind specified by the + // remover, that are within the currently processed computation from the + // graph. 
+ for (HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { + for (HloInstruction* operand : instruction->unique_operands()) { + if (domain_map->IsDomainInstruction(operand)) { + VLOG(5) << "Removing " << operand->name(); + TF_RETURN_IF_ERROR( + operand->ReplaceAllUsesWith(operand->mutable_operand(0))); + TF_RETURN_IF_ERROR(computation->RemoveInstruction(operand)); + ++removed_domains; + } + } + } + HloInstruction* root = computation->root_instruction(); + if (root != nullptr && domain_map->IsDomainInstruction(root)) { + VLOG(5) << "Removing " << root->name(); + computation->set_root_instruction(root->mutable_operand(0)); + TF_RETURN_IF_ERROR(computation->RemoveInstruction(root)); + ++removed_domains; + } + } + VLOG(3) << "Removed " << removed_domains << " kDomain instructions of '" + << remover_->kind_ << "' kind"; + if (removed_domains > 0) { + hlo_graph_dumper::MaybeDumpHloModule(*module_, "After Domain Remover"); + } + return removed_domains > 0; +} + +StatusOr HloDomainRemover::Run(HloModule* module) { + RunContext run_context(module, this); + return run_context.Run(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.h b/tensorflow/compiler/xla/service/hlo_domain_remover.h new file mode 100644 index 00000000000000..0c71dd34fd4d29 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_remover.h @@ -0,0 +1,56 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_REMOVER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_REMOVER_H_ + +#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/lib/core/status.h" + +namespace xla { + +// Removes all the kDomain instructions of a given kind from the input module, +// and calls the normalizer to propagate the properties on the possibly new born +// instructions. +class HloDomainRemover : public HloPassInterface { + public: + // Creates a new HloDomainRemover object tasked at removing all the kDomain + // instructions of a given kind. + // In case a reachable set (the set of instructions within a computation, + // which are mutually reachable via operand/user pathways) has all the + // instructions in it with the same attributes (ie, sharding), a normalizer + // function is tasked at applying attribute normalization on the instructions + // within such domain. 
+ HloDomainRemover( + tensorflow::StringPiece kind, + std::function normalizer) + : kind_(kind.ToString()), normalizer_(std::move(normalizer)) {} + + tensorflow::StringPiece name() const override { return "domain_remover"; } + + StatusOr Run(HloModule* module) override; + + private: + class RunContext; + + string kind_; + std::function normalizer_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_REMOVER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc new file mode 100644 index 00000000000000..5553ddb153f7f1 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc @@ -0,0 +1,432 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo_domain_isolator.h" +#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" +#include "tensorflow/compiler/xla/service/hlo_domain_remover.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +class HloDomainTest : public HloTestBase { + protected: + bool FindUserViaDomainPath(HloInstruction* instruction, + HloInstruction* operand) const { + for (HloInstruction* user : operand->users()) { + if (user == instruction) { + return true; + } + if (user->opcode() == HloOpcode::kDomain && + FindUserViaDomainPath(instruction, user)) { + return true; + } + } + return false; + } + + // Checks whether there is a kDomain instruction in the edge between the + // instruction and the operand. + bool HasDomainEdge(HloModule* module, + tensorflow::StringPiece instruction_name, + tensorflow::StringPiece operand_name) { + HloInstruction* instruction = FindInstruction(module, instruction_name); + HloInstruction* operand = FindInstruction(module, operand_name); + CHECK_NE(instruction, nullptr); + CHECK_NE(operand, nullptr); + if (!instruction->IsUserOf(operand)) { + // If instruction is not an immediate user, we must find a path from + // operand to instruction anyway, otherwise there is a corruption. 
+ if (FindUserViaDomainPath(instruction, operand)) { + return true; + } + LOG(FATAL) << "Bad HLO module generated across the '" << instruction_name + << "' and '" << operand_name << "' instructions:\n" + << module->ToString(); + } + return false; + } + + StatusOr> ParseModule( + tensorflow::StringPiece hlo_string) { + HloModuleConfig config; + config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags()); + return ParseHloString(hlo_string, config); + } +}; + +// Dummy DomainMetadata implementation which create kDomain boundaries around +// HLO instructions with the same metadata().op_name() values. +class OpNameMetadata : public DomainMetadata { + public: + explicit OpNameMetadata(string opname) : opname_(std::move(opname)) {} + + std::unique_ptr Clone() const override { + return MakeUnique(opname_); + } + + tensorflow::StringPiece Kind() const override { return KindName(); } + + bool Matches(const DomainMetadata& other) const override { + const OpNameMetadata* other_ptr = + dynamic_cast(&other); + if (other_ptr == nullptr) { + // If other is not a OpNameMetadata, then it is clearly a no match. + return false; + } + return opname_ == other_ptr->opname_; + } + + string ToString() const override { return opname_; } + + Status NormalizeInstructions( + const DomainMetadata::Domain& domain) const override { + // For the purposes of this test, nothing to do. + return Status::OK(); + } + + static tensorflow::StringPiece KindName() { return "opname"; } + + private: + string opname_; +}; + +// Creator function for OpNameMetadata domains. +std::unique_ptr OpNameDomainCreator(HloInstruction* instruction, + HloInstruction* operand) { + if (instruction->metadata().op_name() == operand->metadata().op_name()) { + return nullptr; + } + std::unique_ptr operand_side_metadata = + MakeUnique(operand->metadata().op_name()); + std::unique_ptr user_side_metadata = + MakeUnique(instruction->metadata().op_name()); + return HloInstruction::CreateDomain(operand->shape(), operand, + std::move(operand_side_metadata), + std::move(user_side_metadata)); +} + +Status OpNameDomainNormalizer(const DomainMetadata::Domain& domain) { + // Nothing to do for the particular use this test make of the OpName domains. 
+ return Status::OK(); +} + +TEST_F(HloDomainTest, CheckDomainLinks) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = (f32[4], f32[4]) parameter(0) + a = f32[4] get-tuple-element(p0), index=0 + b = f32[4] get-tuple-element(p0), index=1 + c = f32[4] add(f32[4] a, f32[4] b), sharding={maximal device=1} + d = f32[4] subtract(a, b), sharding={maximal device=1} + e = f32[4] multiply(c, d), sharding={maximal device=1} + ROOT f = (f32[4], f32[4], f32[4]) tuple(c, d, e) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainIsolator isolator(CreateShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + EXPECT_TRUE(isolator_changed); + + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); + + HloDomainRemover remover(ShardingMetadata::KindName(), + NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + EXPECT_TRUE(remover_changed); + + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); +} + +TEST_F(HloDomainTest, CheckNoDomainAddedIfNoSharding) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = (f32[4], f32[4]) parameter(0) + a = f32[4] get-tuple-element(p0), index=0 + b = f32[4] get-tuple-element(p0), index=1 + c = f32[4] add(f32[4] a, f32[4] b) + d = f32[4] subtract(a, b) + e = f32[4] multiply(c, d) + ROOT f = (f32[4], f32[4], f32[4]) tuple(c, d, e) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainIsolator isolator(CreateShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + EXPECT_TRUE(!isolator_changed); +} + +TEST_F(HloDomainTest, CheckDomainAroundIO) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = (f32[4]) parameter(0) + a = f32[4] get-tuple-element(p0), index=0 + b = (f32[4], u32[]) send(a), channel_id=1, sharding={maximal device=0} + c = () send-done(b), channel_id=1, sharding={maximal device=0} + d = (f32[4], u32[]) recv(), channel_id=2, sharding={maximal device=0} + e = f32[4] recv-done(d), channel_id=2, sharding={maximal device=0} + f = f32[4] add(a, e) + g = f32[4] subtract(a, e) + ROOT h = (f32[4], f32[4]) tuple(f, g) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainIsolator isolator(CreateShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + EXPECT_TRUE(isolator_changed); + + EXPECT_TRUE(HasDomainEdge(module.get(), "b", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "f", "e")); + EXPECT_FALSE(HasDomainEdge(module.get(), "a", "p0")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); + + 
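// Editorial note on the intended pipeline (illustrative, drawn from the test
// code in this file): the isolator marks every sharding change on a graph edge
// with a kDomain pair, other passes are then free to rewrite the region in
// between, and the remover verifies that the metadata on the domain frontier
// still matches, pushes the sharding onto any newly created instructions via
// NormalizeShardingDomain, and finally deletes the kDomain markers:
//
//   HloDomainIsolator isolator(CreateShardingDomain);
//   TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
//   // ... run sharding-oblivious optimization passes here ...
//   HloDomainRemover remover(ShardingMetadata::KindName(),
//                            NormalizeShardingDomain);
//   TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));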
HloDomainRemover remover(ShardingMetadata::KindName(), + NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + EXPECT_TRUE(remover_changed); + + EXPECT_FALSE(HasDomainEdge(module.get(), "b", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "f", "e")); +} + +TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + a = (f32[4], u32[]) recv(), channel_id=1, sharding={maximal device=-1} + b = f32[4] recv-done(a), channel_id=1, sharding={maximal device=-1} + c = f32[4] add(b, b), sharding={maximal device=-1} + d = (f32[4], u32[]) send(c), channel_id=2, sharding={maximal device=-1} + ROOT e = () send-done(d), channel_id=2, sharding={maximal device=-1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainIsolator isolator(CreateShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + EXPECT_FALSE(isolator_changed); +} + +TEST_F(HloDomainTest, CheckNormalizationOnPureIOComputation) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + a = (f32[4], u32[]) recv(), channel_id=1, sharding={maximal device=0} + b = f32[4] recv-done(a), channel_id=1, sharding={maximal device=0} + c = f32[4] add(b, b) + d = (f32[4], u32[]) send(c), channel_id=2, sharding={maximal device=0} + ROOT e = () send-done(d), channel_id=2, sharding={maximal device=0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainRemover remover(ShardingMetadata::KindName(), + NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + EXPECT_FALSE(remover_changed); + + HloInstruction* add = FindInstruction(module.get(), "c"); + ASSERT_NE(add, nullptr); + auto device = add->sharding_unique_device(); + EXPECT_TRUE(device.has_value()); + EXPECT_EQ(*device, 0); +} + +TEST_F(HloDomainTest, CheckMultiDomainLinks) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + p0 = (f32[4], f32[4]) parameter(0) + a = f32[4] get-tuple-element(p0), index=0 + b = f32[4] get-tuple-element(p0), index=1 + c = f32[4] add(a, b), sharding={maximal device=1} + d = f32[4] subtract(a, c), sharding={maximal device=1}, metadata={op_name="D"} + e = f32[4] multiply(c, d), sharding={maximal device=1}, metadata={op_name="D"} + f = f32[4] add(e, c), sharding={maximal device=1} + ROOT g = (f32[4], f32[4], f32[4]) tuple(c, d, f) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainIsolator sharding_isolator(CreateShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool sharding_isolator_changed, + sharding_isolator.Run(module.get())); + EXPECT_TRUE(sharding_isolator_changed); + + HloDomainIsolator opname_isolator(OpNameDomainCreator); + TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed, + opname_isolator.Run(module.get())); + EXPECT_TRUE(opname_isolator_changed); + + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c")); + EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d")); + + HloDomainRemover sharding_remover(ShardingMetadata::KindName(), + 
NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed, + sharding_remover.Run(module.get())); + EXPECT_TRUE(sharding_remover_changed); + + HloDomainRemover opname_remover(OpNameMetadata::KindName(), + OpNameDomainNormalizer); + TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed, + opname_remover.Run(module.get())); + EXPECT_TRUE(opname_remover_changed); + + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a")); + EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c")); +} + +TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) { + const char* const hlo_string = R"( +HloModule Module + +ENTRY entry { + infeed = (f32[4], f32[4]) infeed(), + sharding={{maximal device=1}, {maximal device=0}} + gte0 = f32[4] get-tuple-element(infeed), index=0 + gte1 = f32[4] get-tuple-element(infeed), index=1 + copy0 = f32[4] copy(gte0) + copy1 = f32[4] copy(gte1) + ROOT add = f32[4] add(copy0, copy1) +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseModule(hlo_string)); + LOG(INFO) << "Original module:\n" << module->ToString(); + + HloDomainIsolator isolator(CreateShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get())); + EXPECT_TRUE(isolator_changed); + + EXPECT_TRUE(HasDomainEdge(module.get(), "gte0", "infeed")); + EXPECT_TRUE(HasDomainEdge(module.get(), "gte1", "infeed")); + EXPECT_FALSE(HasDomainEdge(module.get(), "copy0", "gte0")); + EXPECT_FALSE(HasDomainEdge(module.get(), "copy1", "gte1")); + + // Inject unassigned tuple/gte within the infeed domain, to simulate the + // HLO passes adding unexpected instructions. + // + // infeed + // / \ + // GTE0 GTE1 + // / \ + // COPY0 COPY1 + // \ / + // \ / + // TUPLE + // | + // DOMAIN + HloInstruction* infeed = FindInstruction(module.get(), "infeed"); + ASSERT_NE(infeed, nullptr); + auto infeed_users = infeed->users(); + HloInstruction* new_gte0 = + infeed->parent()->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(infeed->shape(), 0), infeed, 0)); + HloInstruction* new_copy0 = + infeed->parent()->AddInstruction(HloInstruction::CreateUnary( + new_gte0->shape(), HloOpcode::kCopy, new_gte0)); + HloInstruction* new_gte1 = + infeed->parent()->AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::GetTupleElementShape(infeed->shape(), 1), infeed, 1)); + HloInstruction* new_copy1 = + infeed->parent()->AddInstruction(HloInstruction::CreateUnary( + new_gte1->shape(), HloOpcode::kCopy, new_gte1)); + HloInstruction* new_tuple = infeed->parent()->AddInstruction( + HloInstruction::CreateTuple({new_copy0, new_copy1})); + for (HloInstruction* user : infeed_users) { + TF_EXPECT_OK(infeed->ReplaceUseWith(user, new_tuple)); + } + + HloDomainRemover remover(ShardingMetadata::KindName(), + NormalizeShardingDomain); + TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get())); + EXPECT_TRUE(remover_changed); + + struct Assignment { + HloInstruction* instruction; + int64 device; + } assignments[] = { + {new_gte0, 1}, + {new_copy0, 1}, + {new_gte1, 0}, + {new_copy1, 0}, + }; + for (auto& assignment : assignments) { + auto device = assignment.instruction->sharding_unique_device(); + EXPECT_TRUE(device.has_value()); + EXPECT_EQ(*device, assignment.device); + } + EXPECT_TRUE(new_tuple->has_sharding()); + EXPECT_EQ( + new_tuple->sharding(), + HloSharding::Tuple(new_tuple->shape(), {HloSharding::AssignDevice(1), + 
HloSharding::AssignDevice(0)})); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc index d236f83aeb9254..4ed1508d706768 100644 --- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc +++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc @@ -119,6 +119,7 @@ StatusOr HloElementTypeConverter::Run(HloModule* module) { return false; } + HloCloneContext context(module); bool changed = false; for (auto* computation : module->computations()) { for (auto* hlo : computation->MakeInstructionPostOrder()) { @@ -140,6 +141,7 @@ StatusOr HloElementTypeConverter::Run(HloModule* module) { // These are ops with embedded computations where it suffices to convert // the embedded computations instead of converting the ops themselves. if (opcode == HloOpcode::kWhile || opcode == HloOpcode::kCall || + opcode == HloOpcode::kCrossReplicaSum || opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap || opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow || opcode == HloOpcode::kSelectAndScatter || @@ -180,7 +182,7 @@ StatusOr HloElementTypeConverter::Run(HloModule* module) { ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_); new_hlo = computation->AddInstruction( - hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule())); + hlo->CloneWithNewOperands(shape, new_operands, &context)); TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo)); new_hlo = ToElementType(new_hlo, eliminate_type_); @@ -189,16 +191,16 @@ StatusOr HloElementTypeConverter::Run(HloModule* module) { Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_, replace_with_type_); - new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands( - new_shape, new_operands, hlo->GetModule())); + new_hlo = computation->AddInstruction( + hlo->CloneWithNewOperands(new_shape, new_operands, &context)); TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo)); // Convert the elements of the result of `new_hlo` to produce a new // tuple with shape `old_shape`. new_hlo = ConvertTupleElements(new_hlo, old_shape); } else { - new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands( - hlo->shape(), new_operands, hlo->GetModule())); + new_hlo = computation->AddInstruction( + hlo->CloneWithNewOperands(hlo->shape(), new_operands, &context)); TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo)); } diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc index c5e30148345fec..1e78d775c8e172 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_query.h" @@ -42,7 +43,6 @@ limitations under the License. 
#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" @@ -52,25 +52,11 @@ namespace xla { namespace { using tensorflow::gtl::ArraySlice; -using tensorflow::gtl::FlatSet; -using tensorflow::gtl::optional; - -template -struct is_complex_t : public std::false_type {}; - -template <> -struct is_complex_t : public std::true_type {}; - -template -struct is_complex64_t : public std::false_type {}; - -template <> -struct is_complex64_t : public std::true_type {}; template StatusOr> Compare(const Shape& shape, HloOpcode opcode, - const Literal& lhs_literal, - const Literal& rhs_literal) { + LiteralSlice lhs_literal, + LiteralSlice rhs_literal) { std::function compare_op; switch (opcode) { case HloOpcode::kEq: @@ -108,7 +94,7 @@ StatusOr> Compare(const Shape& shape, HloOpcode opcode, << HloOpcodeString(opcode); } - auto result = Literal::CreateFromShape(shape); + auto result = MakeUnique(shape); TF_RETURN_IF_ERROR(result->Populate([&](ArraySlice multi_index) { return compare_op(lhs_literal.Get(multi_index), rhs_literal.Get(multi_index)); @@ -119,8 +105,8 @@ StatusOr> Compare(const Shape& shape, HloOpcode opcode, template <> StatusOr> Compare( - const Shape& shape, HloOpcode opcode, const Literal& lhs_literal, - const Literal& rhs_literal) { + const Shape& shape, HloOpcode opcode, LiteralSlice lhs_literal, + LiteralSlice rhs_literal) { std::function compare_op; switch (opcode) { case HloOpcode::kEq: @@ -138,7 +124,7 @@ StatusOr> Compare( << HloOpcodeString(opcode); } - auto result = Literal::CreateFromShape(shape); + auto result = MakeUnique(shape); TF_RETURN_IF_ERROR(result->Populate([&](ArraySlice multi_index) { return compare_op(lhs_literal.Get(multi_index), rhs_literal.Get(multi_index)); @@ -147,2092 +133,48 @@ StatusOr> Compare( return std::move(result); } -template -StatusOr> ElementWiseUnaryOpImpl( - HloInstruction* instruction, - const std::function& unary_op, - const Literal& operand_literal) { - const auto shape = instruction->shape(); - const auto* operand = instruction->operand(0); - - // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is - // removed. - if (!ShapeUtil::SameDimensions(shape, operand->shape())) { - return Unimplemented( - "Implicit broadcasting is currently unsupported in HLO evaluator " - "Shape Mismatch: %s vs %s", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(operand->shape()).c_str()); - } - - auto result = Literal::CreateFromShape(shape); - - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice multi_index) { - return unary_op(operand_literal.Get(multi_index)); - })); - return std::move(result); -} - -// For one particular placement of a window in a base shape (the placement is -// represented as `window_count_index`), iterates inside the window. Translates -// the window index into base index. If the base index is within bound, call `f` -// with the base index. 
-void IterateThroughWindow( - const Shape& window_shape, const Window& window, const Shape& base_shape, - const ArraySlice& window_count_index, - const std::function&)>& f) { - const int64 rank = ShapeUtil::Rank(base_shape); - DimensionVector window_index(rank); - std::fill(window_index.begin(), window_index.end(), 0); - do { - std::vector base_index(rank); - bool out_of_bound = false; - for (int64 i = 0; i < rank; ++i) { - base_index[i] = window_count_index[i] * window.dimensions(i).stride() + - window_index[i] - window.dimensions(i).padding_low(); - if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) { - out_of_bound = true; - break; - } - } - if (!out_of_bound) { - f(base_index); - } - } while (IndexUtil::BumpIndices(window_shape, &window_index)); -} - -// Creates a vector of multipliers which can be used to create a linear index -// into shape. -// -// Given the multidimensional index {i1, ..., iN} and -// M = MakeDimMultipliers(shape), the corresponding linear index LI is simply -// -// LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N]. -// -// This lets you calculate LI given the multidimensional indices in any order. -DimensionVector MakeDimMultipliers(const Shape& shape) { - DimensionVector v(ShapeUtil::Rank(shape)); - int64 scale = 1; - for (auto dim : LayoutUtil::MinorToMajor(shape)) { - v[dim] = scale; - scale *= shape.dimensions(dim); - } - return v; -} - } // namespace -template -class HloEvaluator::TypedVisitor : public DfsHloVisitorWithDefault { - public: - explicit TypedVisitor(HloEvaluator* p) : parent_(p) {} - - // The following higher-order functions convert a function with ElementwiseT - // to a function with ReturnT. - std::function ConvertUnaryFunction( - const std::function& unary_op) { - return [&unary_op](ReturnT arg) { - return static_cast(unary_op(static_cast(arg))); - }; - } - std::function ConvertBinaryFunction( - const std::function& - binary_op) { - return [&binary_op](ReturnT arg1, ReturnT arg2) { - return static_cast(binary_op(static_cast(arg1), - static_cast(arg2))); - }; - } - std::function ConvertTernaryFunction( - const std::function& ternary_op) { - return [&ternary_op](ReturnT arg1, ReturnT arg2, ReturnT arg3) { - return static_cast(ternary_op(static_cast(arg1), - static_cast(arg2), - static_cast(arg3))); - }; - } - - Status DefaultAction(HloInstruction* hlo_instruction) override { - return Unimplemented("unhandled HLO ops for HloEvaluator: %s.", - HloOpcodeString(hlo_instruction->opcode()).c_str()); - } - - // TODO(b/35950897): many of the stl functions used in the handlers are not - // overloaded for every XLA primitive types. 
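The `MakeDimMultipliers` helper deleted above (it moves into the typed-visitor header) builds per-dimension multipliers so a multidimensional index can be folded into a linear index in any order. A small sketch of the same idea, assuming a plain row-major layout where the real code walks the shape's minor-to-major order:

```c++
#include <cstdint>
#include <vector>

// Multiplier (stride) per dimension for a row-major layout: the last
// dimension varies fastest, so it gets multiplier 1.
std::vector<int64_t> MakeDimMultipliers(const std::vector<int64_t>& dims) {
  std::vector<int64_t> multipliers(dims.size());
  int64_t scale = 1;
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    multipliers[i] = scale;
    scale *= dims[i];
  }
  return multipliers;
}

// LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N]; the terms can be summed in
// any order, which is exactly why the convolution loop below uses this form.
int64_t LinearIndex(const std::vector<int64_t>& index,
                    const std::vector<int64_t>& multipliers) {
  int64_t linear = 0;
  for (size_t d = 0; d < index.size(); ++d) linear += index[d] * multipliers[d];
  return linear;
}
```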
- - template ::value>::type* = - nullptr> - Status HandleAbs(HloInstruction* abs) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs], - ElementWiseUnaryOp(abs, [](NativeT elem_operand) { - return elem_operand; - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleAbs(HloInstruction* abs) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs], - ElementWiseUnaryOp(abs, [](NativeT elem_operand) { - return std::abs(elem_operand); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleAbs(HloInstruction* abs) { - const Literal& operand_literal = - parent_->GetEvaluatedLiteralFor(abs->operand(0)); - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[abs], - (ElementWiseUnaryOpImpl( - abs, [](NativeT elem_operand) { return std::abs(elem_operand); }, - operand_literal))); - - return Status::OK(); - } - - Status HandleAbs(HloInstruction* abs) override { - // If the operand is of C64 type, the return type of abs will be F32. - // However, ElementwiseT would still be the return type, F32, and thus - // specifying the ElementwiseT explicitly as C64 is needed below. - if (abs->operand(0)->shape().element_type() == C64) { - return HandleAbs(abs); - } - return HandleAbs(abs); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleRound(HloInstruction* round) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[round], - ElementWiseUnaryOp(round, [](ElementwiseT elem_operand) { - return std::round(elem_operand); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleRound(HloInstruction* round) { - return InvalidArgument("Unsupported type for Round"); - } - - Status HandleRound(HloInstruction* round) override { - return HandleRound(round); - } - - Status HandleBroadcast(HloInstruction* broadcast) override { - parent_->evaluated_[broadcast] = - Literal::CreateFromShape(broadcast->shape()); - auto output = parent_->evaluated_[broadcast].get(); - const Literal& operand_to_broadcast = - parent_->GetEvaluatedLiteralFor(broadcast->operand(0)); - std::vector broadcast_indices( - ShapeUtil::Rank(broadcast->operand(0)->shape()), 0); - - TF_RET_CHECK(broadcast->dimensions().size() == - ShapeUtil::Rank(operand_to_broadcast.shape())) - << "broadcast dimensions is of size: " << broadcast->dimensions().size() - << " and rank of operand_to_broadcast is: " - << ShapeUtil::Rank(operand_to_broadcast.shape()); - // Checks that operand's dimensions are the same as the broadcast's - // dimensions along the dimensions to be broadcasted. 
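`HandleAbs` above is one of several handlers that use `std::enable_if` to pick an implementation per native type: identity for unsigned integers, `std::abs` for signed and floating-point types, and a complex specialization whose result type differs from its input. A freestanding sketch of that dispatch pattern, with illustrative names:

```c++
#include <cmath>
#include <complex>
#include <cstdlib>
#include <type_traits>

// Unsigned integers: |x| == x, so no work is needed.
template <typename T,
          typename std::enable_if<std::is_unsigned<T>::value>::type* = nullptr>
T AbsValue(T x) { return x; }

// Signed integers and floating-point types: defer to std::abs.
template <typename T,
          typename std::enable_if<std::is_signed<T>::value ||
                                  std::is_floating_point<T>::value>::type* = nullptr>
T AbsValue(T x) { return std::abs(x); }

// Complex: the magnitude has a different (real) element type than the input,
// which is why the real handler special-cases C64.
template <typename T>
T AbsValue(std::complex<T> x) { return std::abs(x); }
```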
- for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { - TF_RET_CHECK(broadcast->shape().dimensions(broadcast->dimensions(i)) == - operand_to_broadcast.shape().dimensions(i)); - } - - return output->Populate([&](ArraySlice multi_index) { - for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { - broadcast_indices[i] = multi_index[broadcast->dimensions(i)]; - } - return operand_to_broadcast.Get(broadcast_indices); - }); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleCeil(HloInstruction* ceil) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil], - ElementWiseUnaryOp(ceil, [](ElementwiseT elem_operand) { - return std::ceil(elem_operand); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleCeil(HloInstruction* ceil) { - return InvalidArgument("Unsupported type for Ceil"); - } - - Status HandleCeil(HloInstruction* ceil) override { - return HandleCeil(ceil); - } - - Status HandleConvert(HloInstruction* convert) override { - const HloInstruction* operand = convert->operand(0); - TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape())); - TF_ASSIGN_OR_RETURN(std::unique_ptr result, - parent_->GetEvaluatedLiteralFor(operand).Convert( - convert->shape().element_type())); - - if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) { - parent_->evaluated_[convert] = std::move(result); - } else { - parent_->evaluated_[convert] = - result->Relayout(convert->shape().layout()); - } - return Status::OK(); - } - - Status HandleBitcastConvert(HloInstruction* convert) override { - const HloInstruction* operand = convert->operand(0); - TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape())); - TF_ASSIGN_OR_RETURN(std::unique_ptr result, - parent_->GetEvaluatedLiteralFor(operand).BitcastConvert( - convert->shape().element_type())); - - if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) { - parent_->evaluated_[convert] = std::move(result); - } else { - parent_->evaluated_[convert] = - result->Relayout(convert->shape().layout()); - } - return Status::OK(); - } - - Status HandleExp(HloInstruction* exp) override { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp], - ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) { - return std::exp(elem_operand); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleFloor(HloInstruction* floor) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[floor], - ElementWiseUnaryOp(floor, [](ElementwiseT elem_operand) { - return std::floor(elem_operand); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleFloor(HloInstruction* floor) { - return InvalidArgument("Unsupported type for Floor"); - } - - Status HandleFloor(HloInstruction* floor) override { - return HandleFloor(floor); - } - - Status HandleLog(HloInstruction* log) override { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[log], - ElementWiseUnaryOp(log, [](ElementwiseT elem_operand) { - return std::log(elem_operand); - })); - return Status::OK(); - } - - template ::value && - !std::is_same::value>::type* = nullptr> - Status HandleNot(HloInstruction* not_) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_], - ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) { - return ~elem_operand; - })); - return Status::OK(); - } - - template 
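`HandleBroadcast` above never materializes copies up front; for every output index it derives the corresponding operand index through the broadcast dimension numbers. The core mapping, written as a standalone helper:

```c++
#include <cstdint>
#include <vector>

// For each operand dimension i, broadcast_dimensions[i] names the output
// dimension it maps to, so the operand index is a gather of output
// coordinates along those dimensions.
std::vector<int64_t> BroadcastOperandIndex(
    const std::vector<int64_t>& output_index,
    const std::vector<int64_t>& broadcast_dimensions) {
  std::vector<int64_t> operand_index(broadcast_dimensions.size());
  for (size_t i = 0; i < broadcast_dimensions.size(); ++i) {
    operand_index[i] = output_index[broadcast_dimensions[i]];
  }
  return operand_index;
}
```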
::value>::type* = nullptr> - Status HandleNot(HloInstruction* not_) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_], - ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) { - return !elem_operand; - })); - return Status::OK(); - } - - template ::value>::type* = - nullptr> - Status HandleNot(HloInstruction* not_) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_], - ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) { - return !elem_operand; - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleNot(HloInstruction* not_) { - return InvalidArgument("Unsupported type for Not"); - } - - Status HandleNot(HloInstruction* not_) override { - return HandleNot(not_); - } - - template ::value && - !std::is_floating_point::value>::type* = nullptr> - Status HandleNegate(HloInstruction* negate) { - using type = typename std::make_unsigned::type; - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[negate], - ElementWiseUnaryOp(negate, [](ElementwiseT elem_operand) { - return NativeT(-type(elem_operand)); - })); - return Status::OK(); - } - - template ::value || - std::is_floating_point::value>::type* = nullptr> - Status HandleNegate(HloInstruction* negate) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[negate], - ElementWiseUnaryOp( - negate, [](ElementwiseT elem_operand) { return -elem_operand; })); - return Status::OK(); - } - - Status HandleNegate(HloInstruction* negate) override { - return HandleNegate(negate); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleSign(HloInstruction* sign) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign], - ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) { - return (ElementwiseT(0) < elem_operand) - - (elem_operand < ElementwiseT(0)); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleSign(HloInstruction* sign) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign], - ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) { - auto abs_val = std::abs(elem_operand); - return 0 == abs_val ? 
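`HandleNegate` above routes signed integers through `std::make_unsigned` before negating, so that negating the minimum value wraps instead of tripping signed-overflow undefined behavior. The trick in isolation, as a sketch:

```c++
#include <cstdint>
#include <type_traits>

// Negate a signed integer by casting to the unsigned counterpart first.
// Negating INT32_MIN as a signed operation overflows (undefined behavior);
// the unsigned negation is well defined, and on two's-complement targets the
// cast back yields the expected wrapped value.
template <typename T>
T NegateWithoutOverflowUB(T value) {
  using Unsigned = typename std::make_unsigned<T>::type;
  return static_cast<T>(-static_cast<Unsigned>(value));
}
```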
ElementwiseT(0) - : elem_operand / abs_val; - })); - return Status::OK(); - } - - Status HandleSign(HloInstruction* sign) override { - return HandleSign(sign); - } - - template ::value>::type* = nullptr> - Status HandleAtan2(HloInstruction* atan2) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[atan2], - ElementWiseBinaryOp(atan2, [](ElementwiseT lhs_elem, - ElementwiseT rhs_elem) { - return std::atan2(lhs_elem, rhs_elem); - })); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status HandleAtan2(HloInstruction* atan2) { - return InvalidArgument("Unsupported type for Atan2"); - } - - Status HandleAtan2(HloInstruction* atan2) override { - return HandleAtan2(atan2); - } - - Status HandleTanh(HloInstruction* tanh) override { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh], - ElementWiseUnaryOp(tanh, [](ElementwiseT elem_operand) { - return std::tanh(elem_operand); - })); - return Status::OK(); - } - - template ::value && - !std::is_floating_point::value>::type* = nullptr> - Status HandleMultiply(HloInstruction* multiply) { - using type = typename std::make_unsigned::type; - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[multiply], - ElementWiseBinaryOp(multiply, - [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) { - return NativeT(type(lhs_elem) * type(rhs_elem)); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value || - std::is_floating_point::value || - is_complex_t::value>::type* = nullptr> - Status HandleMultiply(HloInstruction* multiply) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[multiply], - ElementWiseBinaryOp(multiply, - [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) { - return lhs_elem * rhs_elem; - })); - return Status::OK(); - } - - Status HandleMultiply(HloInstruction* multiply) override { - return HandleMultiply(multiply); - } - - Status HandleSubtract(HloInstruction* subtract) override { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[subtract], - ElementWiseBinaryOp(subtract, - [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) { - return lhs_elem - rhs_elem; - })); - return Status::OK(); - } - - Status HandleAdd(HloInstruction* add) override { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[add], - ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem, - ElementwiseT rhs_elem) { - return lhs_elem + rhs_elem; - })); - return Status::OK(); - } - - Status HandleDivide(HloInstruction* divide) override { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide], - ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem, - ElementwiseT rhs_elem) { - return lhs_elem / rhs_elem; - })); - return Status::OK(); - } - - template ::value>::type* = - nullptr> - Status HandleMaximum(HloInstruction* maximum) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[maximum], - ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) { - return std::max(lhs, rhs); - })); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status HandleMaximum(HloInstruction* maximum) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[maximum], - ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) { - return ((lhs >= rhs) || std::isnan(lhs)) ? 
lhs : rhs; - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleMaximum(HloInstruction* maximum) { - return InvalidArgument("Unsupported type for Maximum"); - } - - Status HandleMaximum(HloInstruction* maximum) override { - return HandleMaximum(maximum); - } - - template ::value>::type* = - nullptr> - Status HandleMinimum(HloInstruction* minimum) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[minimum], - ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el, - ElementwiseT rhs_el) { - return std::min(lhs_el, rhs_el); - })); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status HandleMinimum(HloInstruction* minimum) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[minimum], - ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el, - ElementwiseT rhs_el) { - return ((lhs_el <= rhs_el) || std::isnan(lhs_el)) ? lhs_el : rhs_el; - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleMinimum(HloInstruction* minimum) { - return InvalidArgument("Unsupported type for Minimum"); - } - - Status HandleMinimum(HloInstruction* minimum) override { - return HandleMinimum(minimum); - } - - Status HandlePower(HloInstruction* power) override { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[power], - ElementWiseBinaryOp(power, [](ElementwiseT lhs_el, - ElementwiseT rhs_el) { - return std::pow(lhs_el, rhs_el); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleRemainder(HloInstruction* remainder) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder], - ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el, - ElementwiseT rhs_el) { - return std::fmod(lhs_el, rhs_el); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleRemainder(HloInstruction* remainder) { - return InvalidArgument("Unsupported type for Remainder"); - } - - Status HandleRemainder(HloInstruction* remainder) override { - return HandleRemainder(remainder); - } - - template ::value>::type* = - nullptr> - Status HandleAnd(HloInstruction* and_) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[and_], - ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { - return lhs_el & rhs_el; - })); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status HandleAnd(HloInstruction* and_) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[and_], - ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { - return lhs_el && rhs_el; - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleAnd(HloInstruction* and_) { - return InvalidArgument("Unsupported type for And"); - } - - Status HandleAnd(HloInstruction* and_) override { - return HandleAnd(and_); - } - - template ::value>::type* = - nullptr> - Status HandleOr(HloInstruction* or_) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[or_], - ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { - return lhs_el | rhs_el; - })); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status HandleOr(HloInstruction* or_) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[or_], - ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { - return lhs_el || rhs_el; - })); - return Status::OK(); - } - - template < - typename 
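The floating-point `HandleMaximum`/`HandleMinimum` overloads in the deleted block keep NaNs alive rather than letting a comparison silently drop them. A minimal sketch of that rule:

```c++
#include <cmath>

// Propagates a NaN from either operand: if rhs is NaN the comparison fails
// and rhs is returned; if lhs is NaN the isnan check keeps lhs. std::max,
// which compares with '<', only keeps a NaN for one argument order.
float NanPropagatingMax(float lhs, float rhs) {
  return (lhs >= rhs || std::isnan(lhs)) ? lhs : rhs;
}

float NanPropagatingMin(float lhs, float rhs) {
  return (lhs <= rhs || std::isnan(lhs)) ? lhs : rhs;
}
```

With this rule, both `max(NaN, x)` and `max(x, NaN)` evaluate to NaN, which is the behavior the deleted handlers implement.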
NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleOr(HloInstruction* or_) { - return InvalidArgument("Unsupported type for Or"); - } - - Status HandleOr(HloInstruction* or_) override { - return HandleOr(or_); - } - - template ::value && - !std::is_same::value>::type* = nullptr> - Status HandleShiftLeft(HloInstruction* shl) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[shl], - ElementWiseBinaryOp(shl, [](NativeT lhs_elem, NativeT rhs_elem) { - return IsShiftOutOfBounds(rhs_elem) ? 0 - : (lhs_elem << rhs_elem); - })); - return Status::OK(); - } - - template ::value || - std::is_same::value>::type* = - nullptr> - Status HandleShiftLeft(HloInstruction*) { - return InvalidArgument("Unsupported type for ShiftLeft"); - } - - Status HandleShiftLeft(HloInstruction* shl) override { - return HandleShiftLeft(shl); - } - template ::value && - !std::is_same::value>::type* = nullptr> - Status HandleShiftRightArithmetic(HloInstruction* shr) { - typedef typename std::make_signed::type SignedT; - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[shr], - ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) { - SignedT lhs_signed = static_cast(lhs_elem); - if (IsShiftOutOfBounds(rhs_elem)) { - return lhs_signed < 0 ? static_cast(-1) : 0; - } else { - return lhs_signed >> rhs_elem; - } - })); - return Status::OK(); - } - - template ::value || - std::is_same::value>::type* = - nullptr> - Status HandleShiftRightArithmetic(HloInstruction*) { - return InvalidArgument("Unsupported type for ShiftRightArithmetic"); - } - - Status HandleShiftRightArithmetic(HloInstruction* shra) override { - return HandleShiftRightArithmetic(shra); - } - - template ::value && - !std::is_same::value>::type* = nullptr> - Status HandleShiftRightLogical(HloInstruction* shr) { - typedef typename std::make_unsigned::type UnsignedT; - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[shr], - ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) { - // If shift amount is greater than the number of bits, then return 0. 
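The shift handlers above guard against shift amounts that are out of range before doing the shift, since shifting by the full bit width or more is undefined behavior in C++. Roughly, for 32-bit operands:

```c++
#include <cstdint>

// Out-of-range shift amounts are handled explicitly: logical shifts produce 0
// and the arithmetic right shift produces all sign bits (-1 for negative
// inputs), instead of relying on undefined behavior.
int32_t ShiftRightArithmetic32(int32_t lhs, int32_t rhs) {
  if (rhs < 0 || rhs >= 32) return lhs < 0 ? -1 : 0;
  return lhs >> rhs;
}

uint32_t ShiftRightLogical32(uint32_t lhs, int32_t rhs) {
  if (rhs < 0 || rhs >= 32) return 0;
  return lhs >> static_cast<uint32_t>(rhs);
}
```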
- if (IsShiftOutOfBounds(rhs_elem)) { - return static_cast(0); - } - return static_cast(static_cast(lhs_elem) >> - rhs_elem); - })); - return Status::OK(); - } - - template ::value || - std::is_same::value>::type* = - nullptr> - Status HandleShiftRightLogical(HloInstruction*) { - return InvalidArgument("Unsupported type for ShiftRightLogical"); - } - - Status HandleShiftRightLogical(HloInstruction* shrl) override { - return HandleShiftRightLogical(shrl); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleClamp(HloInstruction* clamp) { - std::function - clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) { - return std::fmin(high, std::fmax(value, low)); - }; - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[clamp], - ElementwiseTernaryOp(clamp, - std::move(ConvertTernaryFunction(clamp_op)))); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value>::type* = nullptr> - Status HandleClamp(HloInstruction*) { - return InvalidArgument("Unsupported type for Clamp"); - } - - Status HandleClamp(HloInstruction* clamp) override { - return HandleClamp(clamp); - } - - Status HandleSelect(HloInstruction* select) override { - CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape())); - CHECK(!ShapeUtil::IsTuple(select->shape())); - std::function select_op = - [](bool pred, ReturnT on_true, ReturnT on_false) { - if (pred) { - return on_true; - } - return on_false; - }; - TF_ASSIGN_OR_RETURN(parent_->evaluated_[select], - ElementwiseTernaryOp(select, std::move(select_op))); - return Status::OK(); - } - - Status HandleReverse(HloInstruction* reverse) override { - const auto result_shape = reverse->shape(); - const auto reverse_dimensions = reverse->dimensions(); - - auto operand = reverse->operand(0); - TF_ASSIGN_OR_RETURN(auto inferred_return_shape, - ShapeInference::InferReverseShape(operand->shape(), - reverse_dimensions)); - - TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) - << "return shape set to: " << ShapeUtil::HumanString(result_shape) - << " but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - - const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); - auto result = Literal::CreateFromShape(result_shape); - - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice out_index) { - std::vector from_index(out_index.begin(), out_index.end()); - for (const int64 dim : reverse_dimensions) { - from_index[dim] = result_shape.dimensions(dim) - 1 - out_index[dim]; - } - return operand_literal.Get(from_index); - })); - - parent_->evaluated_[reverse] = std::move(result); - return Status::OK(); - } - - Status HandleConvolution(HloInstruction* conv) override { - auto lhs = conv->operand(0); - auto rhs = conv->operand(1); - const auto& window = conv->window(); - const Shape& result_shape = conv->shape(); - const Shape& lhs_shape = lhs->shape(); - const Shape& rhs_shape = rhs->shape(); - - TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape)); - TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape)); - CHECK(ShapeUtil::IsArray(lhs_shape)); - CHECK(ShapeUtil::IsArray(rhs_shape)); - CHECK(ShapeUtil::SameElementType(lhs_shape, rhs_shape)); - CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape)); - - const auto& dnums = conv->convolution_dimension_numbers(); - const int64 num_spatial_dims = dnums.output_spatial_dimensions_size(); - CHECK_EQ(num_spatial_dims, dnums.input_spatial_dimensions_size()); - CHECK_EQ(num_spatial_dims, 
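`HandleReverse` above fills each output element by flipping the coordinate along every reversed dimension. The index arithmetic on its own:

```c++
#include <cstdint>
#include <vector>

// For each reversed dimension, the source coordinate is the mirror image of
// the output coordinate: from = extent - 1 - to.
std::vector<int64_t> ReverseSourceIndex(
    const std::vector<int64_t>& out_index, const std::vector<int64_t>& dims,
    const std::vector<int64_t>& reverse_dimensions) {
  std::vector<int64_t> from_index = out_index;
  for (int64_t dim : reverse_dimensions) {
    from_index[dim] = dims[dim] - 1 - out_index[dim];
  }
  return from_index;
}
```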
dnums.kernel_spatial_dimensions_size()); - CHECK_GE(num_spatial_dims, 0); - CHECK_EQ(window.dimensions_size(), num_spatial_dims); - - const auto lhs_rank = ShapeUtil::Rank(lhs_shape); - const auto rhs_rank = ShapeUtil::Rank(rhs_shape); - - CHECK_EQ(num_spatial_dims + 2, lhs_rank); - CHECK_EQ(num_spatial_dims + 2, rhs_rank); - - TF_ASSIGN_OR_RETURN(auto inferred_return_shape, - ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, - window, dnums)); - CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) - << "return shape set to: " << ShapeUtil::HumanString(result_shape) - << " but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - - const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); - const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); - - std::vector window_dimension_sizes; - for (auto i : dnums.kernel_spatial_dimensions()) { - window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i)); - } - - const Shape& window_shape = - ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes); - - DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape); - DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape); - - auto lhs_literal_data = lhs_literal.data(); - auto rhs_literal_data = rhs_literal.data(); - - auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window, - &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data, - rhs_literal_data](ArraySlice out_index) { - // Dimension number applicable for input (lhs). - const int64 input_batch_dim = dnums.input_batch_dimension(); - const int64 input_z_dim = dnums.input_feature_dimension(); - // Dimension number applicable for kernel (rhs). - const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension(); - const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension(); - // Dimension number applicable for output. - const int64 output_batch_dim = dnums.output_batch_dimension(); - const int64 output_z_dim = dnums.output_feature_dimension(); - - const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim); - - ElementwiseT result_val = static_cast(0); - DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(), - 0); - - // Convolve input feature with kernel. - do { - for (int64 iz = 0; iz < z_size; ++iz) { - int64 lhs_linear_index = 0; - lhs_linear_index += out_index[output_batch_dim] * - lhs_dim_multipliers[input_batch_dim]; - lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim]; - - int64 rhs_linear_index = 0; - rhs_linear_index += out_index[output_z_dim] * - rhs_dim_multipliers[kernel_output_z_dim]; - rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim]; - - // Find corresponding spatial dimension index for input (lhs). - for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) { - // Spatial dimension number for input (lhs) and output. - const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki); - const int64 output_spatial_dim = - dnums.output_spatial_dimensions(ki); - - // Calculate lhs (input) index without taking base dilation into - // account. - const auto& window_dim = window.dimensions(ki); - const int64 undilated_index = - out_index[output_spatial_dim] * window_dim.stride() - - window_dim.padding_low() + - rhs_spatial_index[ki] * window_dim.window_dilation(); - // Skip if the lhs (input) index is to be dilated. As an - // optimization, skip this mod if there's no dilation. 
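The convolution handler above maps every (output position, kernel position) pair back to an input coordinate before accumulating. For a single spatial dimension, that mapping with stride, low padding, window (kernel) dilation, and base dilation looks roughly like this; positions that fall outside the input, or between base-dilation points, contribute nothing:

```c++
#include <cstdint>

// Returns true and sets *input_pos when the (output, kernel) pair hits a real
// input element; returns false when it lands in padding or between
// base-dilation points, mirroring the skip logic in the evaluator's loop.
bool ConvInputPosition(int64_t output_pos, int64_t kernel_pos, int64_t stride,
                       int64_t padding_low, int64_t window_dilation,
                       int64_t base_dilation, int64_t input_size,
                       int64_t* input_pos) {
  const int64_t undilated =
      output_pos * stride - padding_low + kernel_pos * window_dilation;
  if (base_dilation > 1 && undilated % base_dilation != 0) return false;
  const int64_t pos = base_dilation > 1 ? undilated / base_dilation : undilated;
  if (pos < 0 || pos >= input_size) return false;
  *input_pos = pos;
  return true;
}
```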
- if (window_dim.base_dilation() > 1 && - undilated_index % window_dim.base_dilation() != 0) { - goto cnt; - } - - // Calculate the actual lhs (input) index after dilation. As an - // optimization, skip this integer divide if there's no dilation. - int64 lhs_spatial_index; - if (window_dim.base_dilation() > 1) { - lhs_spatial_index = undilated_index / window_dim.base_dilation(); - } else { - lhs_spatial_index = undilated_index; - } - lhs_linear_index += - lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim]; - - // Skip if input index is not in bounds. - if (!(lhs_spatial_index >= 0 && - lhs_spatial_index < - lhs_shape.dimensions(input_spatial_dim))) { - goto cnt; - } - - rhs_linear_index += - (window_dim.window_reversal() - ? ((window_dim.size() - 1) - rhs_spatial_index[ki]) - : rhs_spatial_index[ki]) * - rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)]; - } - - result_val += - static_cast(lhs_literal_data[lhs_linear_index]) * - static_cast(rhs_literal_data[rhs_linear_index]); - } - cnt : {} - } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index)); - - return static_cast(result_val); - }; - - auto result = Literal::CreateFromShape(result_shape); - TF_RETURN_IF_ERROR(result->PopulateParallel(func)); - - parent_->evaluated_[conv] = std::move(result); - return Status::OK(); - } - - Status HandleDot(HloInstruction* dot) override { - auto lhs = dot->operand(0); - auto rhs = dot->operand(1); - CHECK(ShapeUtil::IsArray(dot->shape())); - CHECK(ShapeUtil::IsArray(lhs->shape())); - CHECK(ShapeUtil::IsArray(rhs->shape())); - - const auto& dnums = dot->dot_dimension_numbers(); - - const auto lhs_rank = ShapeUtil::Rank(lhs->shape()); - const auto rhs_rank = ShapeUtil::Rank(rhs->shape()); - - CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape())); - CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape())); - - // There must be 1 and only 1 Contracting dimension for lhs and rhs. - CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1); - CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1); - const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0); - const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0); - // Contracted dimension sizes must be the same. 
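`HandleDot` above requires exactly one contracting dimension on each side and accumulates the product along it. Stripped of batch dimensions and multi-dimensional index bookkeeping, the inner loop is an ordinary matrix-multiply accumulation:

```c++
#include <cstdint>
#include <vector>

// C[m][n] = sum_k A[m][k] * B[k][n], with all matrices stored row-major.
// k is the single contracting dimension; its extent must match on both sides.
std::vector<float> NaiveDot(const std::vector<float>& a,
                            const std::vector<float>& b, int64_t m, int64_t k,
                            int64_t n) {
  std::vector<float> c(m * n, 0.0f);
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int64_t kk = 0; kk < k; ++kk) {
        acc += a[i * k + kk] * b[kk * n + j];
      }
      c[i * n + j] = acc;
    }
  }
  return c;
}
```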
- CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension), - rhs->shape().dimensions(rhs_contracting_dimension)) - << "lhs contracted dimension: " - << lhs->shape().dimensions(lhs_contracting_dimension) - << " rhs contracted dimension: " - << rhs->shape().dimensions(rhs_contracting_dimension); - const int64 contracted_dimension_size = - lhs->shape().dimensions(lhs_contracting_dimension); - - const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); - const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); - - auto result = Literal::CreateFromShape(dot->shape()); - - CHECK_EQ(dnums.lhs_batch_dimensions_size(), - dnums.rhs_batch_dimensions_size()); - - std::vector lhs_non_contracting_dims; - for (int64 i = 0; i < lhs_rank; i++) { - if (i != lhs_contracting_dimension) { - lhs_non_contracting_dims.push_back(i); - } - } - - std::vector rhs_non_batch_non_contracting_dims; - FlatSet batch_dims_set(dnums.rhs_batch_dimensions().begin(), - dnums.rhs_batch_dimensions().end()); - for (int64 i = 0; i < rhs_rank; i++) { - if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) { - rhs_non_batch_non_contracting_dims.push_back(i); - } - } - - const int64 batch_dim_size = dnums.lhs_batch_dimensions_size(); - const int64 lhs_non_contracting_size = lhs_non_contracting_dims.size(); - - DimensionVector lhs_index(lhs_rank); - DimensionVector rhs_index(rhs_rank); - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice result_index) { - ElementwiseT result_val = static_cast(0); - - // Find the corresponding non-contracting indices for lhs and rhs. - // - // For `result_index`, its batch dimension, if exists, will be at the - // same dimension as the batch dimension of lhs and rhs. More - // specifically: - // - For lhs, the non-contracting dimensions, including the batch - // dimension have the same index as the `result_index`. - // - For rhs, the batch dimension is set seperately from other - // non-contracting dimensions, since these other non-contracting - // dimensions in rhs follow the non-contracting dimensions of lhs in - // the resulting index. - // - // As an example, for a resulting index: - // result_index [result_batch, result_x, result_y] - // the effecting lhs and rhs indices are: - // lhs [result_batch, lhs_non_contracting_dim, contracting_dim - // rhs [result_batch, contracting_dim, rhs_non_contracting_dim] - // `result_x` is only affected by the lhs_non_contracting_dim and - // likewise `result_y` only depends on rhs_non_contracting_dim. - // - // so we can look up the lhs and rhs indices by: - // - // lhs: - // batch index is the same as `result_batch`. - // non-contracting dimension is the same as - // result_index[lhs_non_contracting_dim] - // rhs: - // batch index: the same as `result_batch`. - // non-contracting dimension index: *not* the same as - // result_index[rhs_non_contractng_dim], since the - // non-contracting dimensions of lhs are included in the - // result_index first. Instead, the non_contracting_dim of rhs must - // be calculated as following: - // lhs_non_contracting_dimensions_size + - // (rhs_non_batch_non_contracting_dim - batch_dim_size) - 1 - // - // Note that (rhs_non_batch_contracting_dim - batch_dim_size) is - // the index offset to the result_index that only depends on - // the non_batch and non-contracting dimensions of rhs. -1 at the - // end translates size to index. 
- for (auto i : lhs_non_contracting_dims) { - lhs_index[i] = result_index[i]; - } - for (auto i : dnums.rhs_batch_dimensions()) { - rhs_index[i] = result_index[i]; - } - for (auto i : rhs_non_batch_non_contracting_dims) { - const int64 rhs_non_batch_non_contracting_dim = - lhs_non_contracting_size + (i - batch_dim_size) - 1; - rhs_index[i] = result_index[rhs_non_batch_non_contracting_dim]; - } - - // Accumulates resulting product along the contracted dimension. - for (int64 i = 0; i < contracted_dimension_size; ++i) { - lhs_index[lhs_contracting_dimension] = i; - rhs_index[rhs_contracting_dimension] = i; - - result_val += - static_cast(lhs_literal.Get(lhs_index)) * - static_cast(rhs_literal.Get(rhs_index)); - } - - return static_cast(result_val); - })); - - parent_->evaluated_[dot] = std::move(result); - return Status::OK(); - } - - Status HandlePad(HloInstruction* pad) override { - CHECK(!ShapeUtil::IsTuple(pad->operand(0)->shape())); - // Padding value must be scalar. - CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape())); - CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()), - pad->padding_config().dimensions_size()); - - TF_ASSIGN_OR_RETURN(auto inferred_return_shape, - ShapeInference::InferPadShape( - /*operand_shape=*/pad->operand(0)->shape(), - /*padding_value_shape=*/pad->operand(1)->shape(), - /*padding_config=*/pad->padding_config())); - CHECK(ShapeUtil::Compatible(pad->shape(), inferred_return_shape)) - << "return shape is set to: " << ShapeUtil::HumanString(pad->shape()) - << "but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - - // Create new HLO of padded shape with padding value. - ReturnT scalar = - parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get({}); - auto result = Literal::CreateFromShape(pad->shape()); - TF_RETURN_IF_ERROR(result->Populate( - [&scalar](ArraySlice multi_index) { return scalar; })); - - const Literal& evaluated_operand = - parent_->GetEvaluatedLiteralFor(pad->operand(0)); - - std::vector input_index(ShapeUtil::Rank(evaluated_operand.shape()), - 0); - std::vector target_index(ShapeUtil::Rank(result->shape()), 0); - - // Loop through each element of the operand, assign them to the - // corresponding index of the resulting padded literal. - const PaddingConfig& pad_config = pad->padding_config(); - - auto func = [&](ArraySlice input_index) { - for (auto i = 0; i < input_index.size(); ++i) { - // Interior padding occurs logically before edge padding, so in the case - // of negative edge padding elements are removed from the - // interior-padded operand. - target_index[i] = - pad_config.dimensions(i).edge_padding_low() + - input_index[i] * (pad_config.dimensions(i).interior_padding() + 1); - - // Account for negative low and high padding: skip assignment if the - // any target index is out of range. 
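`HandlePad` above pre-fills the result with the padding value and then scatters operand elements into it; interior padding stretches the spacing between source elements, and negative edge padding can push a target index out of range, in which case the element is simply dropped. The per-dimension index mapping by itself:

```c++
#include <cstdint>

// Maps an operand coordinate to its position in the padded result for one
// dimension. Returns false when negative edge padding pushes it out of range,
// in which case the element is skipped.
bool PaddedTargetIndex(int64_t input_index, int64_t edge_padding_low,
                       int64_t interior_padding, int64_t padded_size,
                       int64_t* target_index) {
  *target_index = edge_padding_low + input_index * (interior_padding + 1);
  return *target_index >= 0 && *target_index < padded_size;
}
```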
- if (!(target_index[i] >= 0 && - target_index[i] < pad->shape().dimensions(i))) { - return true; - } - } - result->Set(target_index, - evaluated_operand.Get(input_index)); - return true; - }; - - std::vector zero_base(evaluated_operand.shape().dimensions_size(), - 0); - std::vector step(evaluated_operand.shape().dimensions_size(), 1); - - ShapeUtil::ForEachIndex( - evaluated_operand.shape(), zero_base, - AsInt64Slice(evaluated_operand.shape().dimensions()), step, func); - - parent_->evaluated_[pad] = std::move(result); - return Status::OK(); - } - - Status HandleDynamicSlice(HloInstruction* dynamic_slice) override { - auto operand = dynamic_slice->operand(0); - auto start_indices = dynamic_slice->operand(1); - auto result_shape = dynamic_slice->shape(); - TF_ASSIGN_OR_RETURN(auto inferred_return_shape, - ShapeInference::InferDynamicSliceShape( - operand->shape(), start_indices->shape(), - dynamic_slice->dynamic_slice_sizes())); - TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) - << "return shape is set to: " << ShapeUtil::HumanString(result_shape) - << "but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - TF_RET_CHECK( - primitive_util::IsIntegralType(start_indices->shape().element_type())); - - const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); - const Literal& start_indices_literal = - parent_->GetEvaluatedLiteralFor(start_indices); - - switch (start_indices->shape().element_type()) { - case S32: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_slice], - DynamicSlice(operand_literal, start_indices_literal, - result_shape)); - } break; - case S64: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_slice], - DynamicSlice(operand_literal, start_indices_literal, - result_shape)); - } break; - case U32: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_slice], - DynamicSlice(operand_literal, start_indices_literal, - result_shape)); - } break; - case U64: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_slice], - DynamicSlice(operand_literal, start_indices_literal, - result_shape)); - } break; - default: - LOG(FATAL) << "HandleDynamicSlice: unhandled primitive type for " - "start_indices: " - << PrimitiveType_Name(start_indices->shape().element_type()); - } - - return Status::OK(); - } - - Status HandleDynamicUpdateSlice( - HloInstruction* dynamic_update_slice) override { - auto operand = dynamic_update_slice->operand(0); - auto update = dynamic_update_slice->operand(1); - auto start_indices = dynamic_update_slice->operand(2); - auto result_shape = dynamic_update_slice->shape(); - TF_ASSIGN_OR_RETURN( - auto inferred_return_shape, - ShapeInference::InferDynamicUpdateSliceShape( - operand->shape(), update->shape(), start_indices->shape())); - TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) - << "return shape is set to: " << ShapeUtil::HumanString(result_shape) - << "but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - TF_RET_CHECK( - primitive_util::IsIntegralType(start_indices->shape().element_type())); - TF_RET_CHECK(ShapeUtil::Compatible(result_shape, operand->shape())); - - const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); - const Literal& update_literal = parent_->GetEvaluatedLiteralFor(update); - const Literal& start_indices_literal = - parent_->GetEvaluatedLiteralFor(start_indices); - - switch (start_indices->shape().element_type()) { - case S32: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_update_slice], - 
DynamicUpdateSlice(operand_literal, update_literal, - start_indices_literal)); - } break; - case S64: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_update_slice], - DynamicUpdateSlice(operand_literal, update_literal, - start_indices_literal)); - } break; - case U32: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_update_slice], - DynamicUpdateSlice(operand_literal, update_literal, - start_indices_literal)); - } break; - case U64: { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[dynamic_update_slice], - DynamicUpdateSlice(operand_literal, update_literal, - start_indices_literal)); - } break; - default: - LOG(FATAL) << "HandleDynamicUpdateSlice: unhandled primitive type for " - "start_indices: " - << PrimitiveType_Name(start_indices->shape().element_type()); - } - - return Status::OK(); - } - - template - StatusOr> MapImpl(HloInstruction* map) { - auto operands = map->operands(); - HloComputation* computation = map->to_apply(); - - auto result = Literal::CreateFromShape(map->shape()); - - HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice multi_index) { - std::vector> arg_literals; - arg_literals.reserve(operands.size()); - - // Construct scalar literal parameters to be passed to the map - // computation. - for (auto operand : operands) { - const Literal& arg_literal = - parent_->GetEvaluatedLiteralFor(operand); - - auto curr_val = arg_literal.Get(multi_index); - auto curr_val_literal = Literal::CreateR0(curr_val); - - arg_literals.push_back(std::move(curr_val_literal)); - } - - std::unique_ptr computed_result = - embedded_evaluator - .Evaluate>(*computation, - arg_literals) - .ConsumeValueOrDie(); - // Clear visit states so that the we can use the evaluate again on - // the same computation. 
- embedded_evaluator.ResetVisitStates(); - - return computed_result->Get({}); - })); - return std::move(result); - } - - Status HandleMap(HloInstruction* map) override { - switch (map->operand(0)->shape().element_type()) { - case PRED: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case U8: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case U32: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case U64: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case S8: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case S32: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case S64: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case F16: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], - MapImpl(map)); - break; - } - case F32: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case F64: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - case C64: { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); - break; - } - default: - LOG(FATAL) << "HandleMap: unhandled primitive type for " - "input operand: " - << PrimitiveType_Name( - map->operand(0)->shape().element_type()); - } - - return Status::OK(); - } - - Status HandleReduce(HloInstruction* reduce) override { - auto arg = reduce->operand(0); - auto init_value = reduce->operand(1); - ArraySlice dimensions(reduce->dimensions()); - HloComputation* function = reduce->to_apply(); - TF_RET_CHECK(ShapeUtil::Rank(reduce->shape()) == - ShapeUtil::Rank(arg->shape()) - dimensions.size()); - TF_ASSIGN_OR_RETURN(auto inferred_return_shape, - ShapeInference::InferReduceShape( - /*arg=*/arg->shape(), - /*init_value=*/init_value->shape(), - /*dimensions_to_reduce=*/dimensions, - /*to_apply=*/function->ComputeProgramShape())); - TF_RET_CHECK(ShapeUtil::Compatible(reduce->shape(), inferred_return_shape)) - << "return shape is set to: " << ShapeUtil::HumanString(reduce->shape()) - << "but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - - const Literal& arg_literal = parent_->GetEvaluatedLiteralFor(arg); - VLOG(3) << "HandleReduce arg_literal: " << arg_literal.ToString(); - const Literal& init_literal = parent_->GetEvaluatedLiteralFor(init_value); - VLOG(3) << "HandleReduce init_literal: " << init_literal.ToString(); - TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); - auto init_scalar = init_literal.Get({}); - - auto result = Literal::CreateFromShape(reduce->shape()); - - const auto arg_dimensions = AsInt64Slice(arg_literal.shape().dimensions()); - std::vector arg_dim_steps(arg_dimensions.size()); - std::vector arg_dim_counts(arg_dimensions.size()); - for (const int64 dim : dimensions) { - arg_dim_steps[dim] = 1; - arg_dim_counts[dim] = arg_dimensions[dim]; - } - - // Map each dimension in the result to a dimension in arg that isn't - // being reduced. - std::vector result_to_arg_index; - for (int64 i = 0; i < arg_dimensions.size(); ++i) { - if (arg_dim_steps[i] == 0) { - result_to_arg_index.push_back(i); - } - } - - HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); - // For each resulting dimension, calculate and assign computed value. 
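`HandleReduce` above marks the reduced dimensions (via `arg_dim_steps`) and collects the unmarked ones into `result_to_arg_index`, so each result coordinate can be placed back into the input index space. A compact sketch of building that map:

```c++
#include <cstdint>
#include <vector>

// Dimensions of the input that are not reduced become, in order, the
// dimensions of the result; marking the reduced dimensions first and then
// collecting the unmarked ones yields the result-dim -> input-dim map.
std::vector<int64_t> ResultToArgDimensions(
    int64_t arg_rank, const std::vector<int64_t>& dims_to_reduce) {
  std::vector<int> reduced(arg_rank, 0);
  for (int64_t dim : dims_to_reduce) reduced[dim] = 1;

  std::vector<int64_t> result_to_arg_index;
  for (int64_t i = 0; i < arg_rank; ++i) {
    if (!reduced[i]) result_to_arg_index.push_back(i);
  }
  return result_to_arg_index;
}
```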
- TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice multi_index) { - ReturnT result_val = init_scalar; - - std::vector base(arg_dimensions.size()); - for (int64 i = 0; i < multi_index.size(); ++i) { - base[result_to_arg_index[i]] = multi_index[i]; - } - - // When the reduction is addition of floats, accumulate in a double - // for better precision. Also, avoid creating Literals for the - // intermediate results; it's much faster. - if (ShapeUtil::ElementIsFloating(init_literal.shape()) && - IsScalarAdd(function)) { - double computed_result = 0; - auto func = [&](ArraySlice input_index) { - computed_result += arg_literal.Get(input_index); - return true; - }; - ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts, - arg_dim_steps, func); - return static_cast(computed_result); - } - auto func = [&](ArraySlice input_index) { - auto curr_val = arg_literal.Get(input_index); - - // Evaluate computation with specified literal operands. - auto curr_val_literal = Literal::CreateR0(curr_val); - auto result_val_literal = Literal::CreateR0(result_val); - std::vector args = {result_val_literal.get(), - curr_val_literal.get()}; - - std::unique_ptr computed_result = - embedded_evaluator.Evaluate(*function, args) - .ConsumeValueOrDie(); - // Clear visit states so that we can use the evaluator again on - // the same computation. - embedded_evaluator.ResetVisitStates(); - // Assign computed result to result_val. - result_val = computed_result->Get({}); - return true; - }; - // Computes one element of the result, reducing all dimensions that - // contribute to that element. - ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts, - arg_dim_steps, func); - return result_val; - })); - - parent_->evaluated_[reduce] = std::move(result); - return Status::OK(); - } - - bool IsScalarAdd(HloComputation* computation) { - HloInstruction* instruction = computation->root_instruction(); - if (instruction->opcode() == HloOpcode::kAdd && - computation->num_parameters() == 2) { - const HloInstruction* lhs = instruction->operand(0); - const HloInstruction* rhs = instruction->operand(1); - return lhs->opcode() == HloOpcode::kParameter && - ShapeUtil::IsScalar(lhs->shape()) && - rhs->opcode() == HloOpcode::kParameter && - ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs; - } - return false; - } - - Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override { - auto operand = select_and_scatter->operand(0); - auto source = select_and_scatter->operand(1); - const Window& window = select_and_scatter->window(); - - const Literal& init_literal = - parent_->GetEvaluatedLiteralFor(select_and_scatter->operand(2)); - TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); - auto init_scalar = init_literal.Get({}); - - auto result = Literal::CreateFromShape(select_and_scatter->shape()); - - // Initialize result array with the init value. 
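The reduce handler above special-cases float additions (`IsScalarAdd`) and accumulates them in a `double` rather than repeatedly round-tripping scalar literals through the embedded evaluator, which is both faster and more accurate. The numerical part of that fast path, as a schematic:

```c++
#include <vector>

// Summing many floats in a float accumulator loses low-order bits; widening
// the accumulator to double keeps them until the final narrowing cast.
float SumWithDoubleAccumulator(const std::vector<float>& values, float init) {
  double acc = init;
  for (float v : values) acc += v;
  return static_cast<float>(acc);
}
```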
- TF_RETURN_IF_ERROR(result->Populate( - [&](ArraySlice output_index) { return init_scalar; })); - - std::vector window_dimension_sizes; - for (const auto& window_dimension : window.dimensions()) { - window_dimension_sizes.push_back(window_dimension.size()); - } - const Shape window_shape = ShapeUtil::MakeShape( - operand->shape().element_type(), window_dimension_sizes); - - HloComputation* select = select_and_scatter->select(); - HloComputation* scatter = select_and_scatter->scatter(); - - const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); - const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source); - - int64 rank = ShapeUtil::Rank(operand_literal.shape()); - - HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); - DimensionVector source_index(rank); - - std::fill(source_index.begin(), source_index.end(), 0); - do { - // For each element in `source`, we place a window in `operand`. For each - // window placement, we iterate inside the window twice: - // - // 1. Find the selected index by applying `select` function to all - // elements. E.g., If the `select` function is GreaterEqual, the first - // iteration through the window finds the biggest value and returns its - // index. - // - // 2. Using the selected index, scatter value from `source` to result. We - // do this by iterating through the window, and compare each index with - // the selected index. - optional selected_val; - optional> selected_index; - - IterateThroughWindow( - window_shape, window, operand_literal.shape(), source_index, - [&](const std::vector& operand_index) { - auto curr_val = operand_literal.Get(operand_index); - if (!selected_val) { - selected_val = curr_val; - selected_index = operand_index; - } - const auto curr_val_literal = Literal::CreateR0(curr_val); - const auto selected_val_literal = - Literal::CreateR0(*selected_val); - - const std::vector args = { - selected_val_literal.get(), curr_val_literal.get()}; - std::unique_ptr computed_result = - embedded_evaluator.Evaluate(*select, args) - .ConsumeValueOrDie(); - bool selected = !computed_result->Get({}); - if (selected) { - selected_val = curr_val; - selected_index = operand_index; - } - embedded_evaluator.ResetVisitStates(); - }); - - IterateThroughWindow( - window_shape, window, operand_literal.shape(), source_index, - [&](const std::vector& operand_index) { - if (std::equal(operand_index.begin(), operand_index.end(), - selected_index->begin())) { - auto source = source_literal.Get(source_index); - auto scattered = result->Get(operand_index); - const auto source_literal = Literal::CreateR0(source); - const auto scattered_literal = - Literal::CreateR0(scattered); - - const std::vector args = { - source_literal.get(), scattered_literal.get()}; - std::unique_ptr computed_result = - embedded_evaluator.Evaluate(*scatter, args) - .ConsumeValueOrDie(); - result->Set(operand_index, computed_result->Get({})); - // Clear visit states so that the we can use the evaluator again - // on the same computation. 
- embedded_evaluator.ResetVisitStates(); - } - }); - } while (IndexUtil::BumpIndices(source->shape(), &source_index)); - - parent_->evaluated_[select_and_scatter] = std::move(result); - return Status::OK(); - } - - Status HandleReduceWindow(HloInstruction* reduce_window) override { - auto operand = reduce_window->operand(0); - const Window& window = reduce_window->window(); - HloComputation* function = reduce_window->to_apply(); - TF_ASSIGN_OR_RETURN( - auto inferred_return_shape, - ShapeInference::InferReduceWindowShape( - /*operand_shape=*/reduce_window->operand(0)->shape(), - /*init_value=*/reduce_window->operand(1)->shape(), window, - /*to_apply_shape=*/function->ComputeProgramShape())); - TF_RET_CHECK( - ShapeUtil::Compatible(reduce_window->shape(), inferred_return_shape)) - << "return shape is set to: " - << ShapeUtil::HumanStringWithLayout(reduce_window->shape()) - << "but is inferred to be: " - << ShapeUtil::HumanStringWithLayout(inferred_return_shape); - - const Literal& operand_literal = - parent_->GetEvaluatedLiteralFor(reduce_window->operand(0)); - VLOG(3) << "HandleReduceWindow arg_literal: " << operand_literal.ToString(); - const Literal& init_literal = - parent_->GetEvaluatedLiteralFor(reduce_window->operand(1)); - VLOG(3) << "HandleReduceWindow init_literal: " << init_literal.ToString(); - TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); - auto init_scalar = init_literal.Get({}); - - auto result = Literal::CreateFromShape(reduce_window->shape()); - - // Creates a Shape object from window, for iteration below. - std::vector window_dimension_sizes; - for (const auto& window_dimension : window.dimensions()) { - window_dimension_sizes.push_back(window_dimension.size()); - } - const Shape window_shape = ShapeUtil::MakeShape( - operand->shape().element_type(), window_dimension_sizes); - - DimensionVector window_index(window.dimensions_size()); - DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape())); - - HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); - // For each resulting dimension, calculate and assign computed value. - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice output_index) { - ReturnT result_val = init_scalar; - - std::fill(window_index.begin(), window_index.end(), 0); - std::fill(operand_index.begin(), operand_index.end(), 0); - - IterateThroughWindow( - window_shape, window, operand_literal.shape(), output_index, - [&](const std::vector& operand_index) { - auto curr_val = operand_literal.Get(operand_index); - - // Evaluate computation with specified literal operands. - const auto curr_val_literal = - Literal::CreateR0(curr_val); - const auto result_val_literal = - Literal::CreateR0(result_val); - const std::vector args = { - result_val_literal.get(), curr_val_literal.get()}; - std::unique_ptr computed_result = - embedded_evaluator.Evaluate(*function, args) - .ConsumeValueOrDie(); - - // Clear visit states so that the we can use the evaluate again - // on the same computation. 
- embedded_evaluator.ResetVisitStates(); - - result_val = computed_result->Get({}); - }); - - return result_val; - })); - - parent_->evaluated_[reduce_window] = std::move(result); - return Status::OK(); - } - - Status HandleSlice(HloInstruction* slice) override { - auto operand = slice->operand(0); - const Shape& shape = slice->shape(); - TF_ASSIGN_OR_RETURN(auto inferred_return_shape, - ShapeInference::InferSliceShape( - operand->shape(), slice->slice_starts(), - slice->slice_limits(), slice->slice_strides())); - TF_RET_CHECK(ShapeUtil::Compatible(shape, inferred_return_shape)) - << "return shape set to: " << ShapeUtil::HumanString(shape) - << " but is inferred to be: " - << ShapeUtil::HumanString(inferred_return_shape); - - const int64 rank = ShapeUtil::Rank(operand->shape()); - const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); - auto func = [&](ArraySlice out_index) { - DimensionVector operand_index(rank); - for (int64 i = 0; i < rank; ++i) { - operand_index[i] = - slice->slice_starts(i) + out_index[i] * slice->slice_strides(i); - } - return operand_literal.Get(operand_index); - }; - - auto result = Literal::CreateFromDimensions( - shape.element_type(), AsInt64Slice(shape.dimensions())); - TF_RETURN_IF_ERROR(result->Populate(func)); - parent_->evaluated_[slice] = std::move(result); - return Status::OK(); - } - - // Enable CLZ only for int32 and uint32. - template < - typename NativeT, - typename std::enable_if< - (std::is_floating_point::value || - std::is_integral::value || is_complex_t::value) && - !(std::is_same::value || - std::is_same::value)>::type* = nullptr> - Status HandleClz(HloInstruction* clz) { - return InvalidArgument("Unsupported type for Clz"); - } - - template ::value || - std::is_same::value>::type* = nullptr> - Status HandleClz(HloInstruction* clz) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz], - ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) { - return 31 - tensorflow::Log2Floor(elem_operand); - })); - return Status::OK(); - } - - Status HandleClz(HloInstruction* clz) override { - return HandleClz(clz); - } - - template ::value>::type* = nullptr> - Status HandleSin(HloInstruction* sin) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[sin], - ElementWiseUnaryOp(sin, [](ElementwiseT elem_operand) { - return std::sin(elem_operand); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value || - is_complex_t::value>::type* = nullptr> - Status HandleSin(HloInstruction* sin) { - return InvalidArgument("Unsupported type for Sin"); - } - - Status HandleSin(HloInstruction* sin) override { - return HandleSin(sin); - } - - template ::value>::type* = nullptr> - Status HandleCos(HloInstruction* cos) { - TF_ASSIGN_OR_RETURN(parent_->evaluated_[cos], - ElementWiseUnaryOp(cos, [](ElementwiseT elem_operand) { - return std::cos(elem_operand); - })); - return Status::OK(); - } - - template < - typename NativeT, - typename std::enable_if::value || - is_complex_t::value>::type* = nullptr> - Status HandleCos(HloInstruction* cos) { - return InvalidArgument("Unsupported type for Cos"); - } - - Status HandleCos(HloInstruction* cos) override { - return HandleCos(cos); - } - - template ::value>::type* = nullptr> - Status HandleReducePrecision(HloInstruction* reduce_precision) { - TF_ASSIGN_OR_RETURN( - parent_->evaluated_[reduce_precision], - ElementWiseUnaryOp(reduce_precision, [reduce_precision]( - ElementwiseT elem) { - uint32_t value_as_int = tensorflow::bit_cast(elem); - const uint32_t mantissa_bits = 
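`HandleClz` above computes count-leading-zeros for 32-bit values as `31 - Log2Floor(operand)`. An equivalent portable loop, for reference; zero is the usual edge case, where every bit counts as a leading zero:

```c++
#include <cstdint>

// Count leading zeros of a 32-bit value: 31 - floor(log2(x)) for x > 0,
// and 32 when x == 0.
int CountLeadingZeros32(uint32_t x) {
  int n = 0;
  while (n < 32 && (x & 0x80000000u) == 0) {
    x <<= 1;
    ++n;
  }
  return n;
}
```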
reduce_precision->mantissa_bits(); - const uint32_t exponent_bits = reduce_precision->exponent_bits(); - - // Code is based on the CPU/GPU implementation in LLVM-emitting code. - // - // Bits in float type: - // mantissa : bits [0:22] - // exponent : bits [23:30] - // sign : bits [31] - if (mantissa_bits < 23) { - const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits); - - // Compute rounding bias for round-to-nearest with ties to even. - // This is equal to a base value of 0111... plus one bit if the last - // remaining mantissa bit is 1. - const uint32_t base_rounding_bias = - (last_mantissa_bit_mask >> 1) - 1; - const uint32_t x_last_mantissa_bit = - (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits); - const uint32_t x_rounding_bias = - x_last_mantissa_bit + base_rounding_bias; - - // Add rounding bias, and mask out truncated bits. Note that the - // case where adding the rounding bias overflows into the exponent - // bits is correct; the non-masked mantissa bits will all be zero, - // and the exponent will be incremented by one. - const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1); - value_as_int = value_as_int + x_rounding_bias; - value_as_int = value_as_int & truncation_mask; - } - if (exponent_bits < 8) { - // Masks for f32 values. - const uint32_t f32_sign_bit_mask = 1u << 31; - const uint32_t f32_exp_bits_mask = 0xffu << 23; - - // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the - // most- significant bit -- is equal to 1.0f for all exponent sizes. - // Adding 2^(n-1)-1 to this gives us the highest non-infinite - // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from - // this gives us the lowest' exponent (corresponding to 0.0f). - // - // Thus, the f32 exponent corresponding to the highest non-infinite - // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32 - // exponent corresponding to the lowest exponent for a bit size of n - // is (2^7-1) - 2^(n-1)-1. - // - // Note that we have already checked that exponents_bits >= 1. - const uint32_t f32_exponent_bias = (1 << 7) - 1; - const uint32_t reduced_exponent_bias = - (1 << (exponent_bits - 1)) - 1; - const uint32_t reduced_max_exponent = - f32_exponent_bias + reduced_exponent_bias; - const uint32_t reduced_min_exponent = - f32_exponent_bias - reduced_exponent_bias; - - // Do we overflow or underflow? - const uint32_t x_exponent = value_as_int & f32_exp_bits_mask; - const bool x_overflows = x_exponent > (reduced_max_exponent << 23); - const bool x_underflows = - x_exponent <= (reduced_min_exponent << 23); - - // Compute appropriately-signed values of zero and infinity. - const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask; - const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask; - - // Force to zero or infinity if overflow or underflow. (Note that - // this truncates all denormal values to zero, rather than rounding - // them.) - value_as_int = x_overflows ? x_signed_inf : value_as_int; - value_as_int = x_underflows ? x_signed_zero : value_as_int; - } - - float reduced_result = tensorflow::bit_cast(value_as_int); - if (std::isnan(elem)) { - reduced_result = mantissa_bits > 0 - ? 
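The `HandleReducePrecision` body above documents the bit manipulation in detail: round the mantissa to the nearest value representable with fewer bits, ties to even, by adding a rounding bias and masking off the truncated bits, then clamp the exponent range. Just the mantissa step, extracted into a standalone sketch (the exponent clamping and NaN fix-up from the original are omitted here):

```c++
#include <cstdint>
#include <cstring>

// Rounds an IEEE-754 float to `mantissa_bits` of mantissa (round to nearest,
// ties to even) by integer arithmetic on its bit pattern. An f32 mantissa has
// 23 bits; a carry out of the mantissa correctly bumps the exponent by one.
float ReduceMantissaBits(float value, int mantissa_bits) {
  if (mantissa_bits >= 23) return value;
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));

  const uint32_t last_kept_bit_mask = 1u << (23 - mantissa_bits);
  const uint32_t base_rounding_bias = (last_kept_bit_mask >> 1) - 1;
  const uint32_t last_kept_bit =
      (bits & last_kept_bit_mask) >> (23 - mantissa_bits);
  bits += base_rounding_bias + last_kept_bit;  // bias 0111... plus tie-break bit
  bits &= ~(last_kept_bit_mask - 1);           // truncate the discarded bits

  float result;
  std::memcpy(&result, &bits, sizeof(result));
  return result;
}
```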
elem - : std::numeric_limits::infinity(); - } - return reduced_result; - })); - return Status::OK(); - } - - template ::value>::type* = nullptr> - Status HandleReducePrecision(HloInstruction* reduce_precision) { - return InvalidArgument("Double not supported for reduce precision"); - } - - template < - typename NativeT, - typename std::enable_if::value || - is_complex_t::value>::type* = nullptr> - Status HandleReducePrecision(HloInstruction* reduce_precision) { - return InvalidArgument("Unsupported type for reduce precision"); - } - - Status HandleReducePrecision(HloInstruction* reduce_precision) override { - return HandleReducePrecision(reduce_precision); - } - - private: - template - StatusOr> DynamicSlice( - const Literal& operand_literal, const Literal& start_indices_literal, - const Shape& result_shape) { - auto start_indices_typed = start_indices_literal.data(); - std::vector start(start_indices_typed.begin(), - start_indices_typed.end()); - - std::vector operand_indices(start.size()); - - auto result = Literal::CreateFromShape(result_shape); - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice multi_index) { - for (int64 i = 0; i < operand_indices.size(); ++i) { - CHECK_GE(multi_index[i] + start[i], 0); - // Mod is only used here to be consistent with the existing - // backends' behavior. - operand_indices[i] = (multi_index[i] + start[i]) % - operand_literal.shape().dimensions(i); - } - - auto result = operand_literal.Get(operand_indices); - return result; - })); - - return std::move(result); - } - - template - StatusOr> DynamicUpdateSlice( - const Literal& operand_literal, const Literal& update_literal, - const Literal& start_indices_literal) { - auto result = operand_literal.CloneToUnique(); - auto start_indices_typed = start_indices_literal.data(); - const auto rank = ShapeUtil::Rank(result->shape()); - std::vector start(rank, 0); - for (int64 i = 0; i < rank; ++i) { - // All other implementations currently wrap-around the index, so this - // should do so as well. - start[i] = (start_indices_typed[i] % result->shape().dimensions(i)); - start[i] += (start[i] < 0) * result->shape().dimensions(i); - } - std::vector result_index(rank, 0); - - auto func = [&](ArraySlice update_index) { - std::transform(update_index.begin(), update_index.end(), start.begin(), - result_index.begin(), std::plus()); - // Same as above, wrap-around only to match other implementations' - // semantics. 
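// Worked example (hypothetical values): for an operand dimension of size 4,
// a start index of 3 and an update of size 2, the start is first reduced
// mod 4 (3 % 4 = 3); the two update elements then land at result indices 3
// and (3 + 1) % 4 = 0, i.e. the write wraps around the end of the dimension
// exactly as the other backends do.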
- std::transform(result_index.begin(), result_index.end(), - result->shape().dimensions().begin(), result_index.begin(), - std::modulus()); - result->Set(result_index, - update_literal.Get(update_index)); - return true; - }; - - std::vector base(update_literal.shape().dimensions_size(), 0); - std::vector step(update_literal.shape().dimensions_size(), 1); - ShapeUtil::ForEachIndex(update_literal.shape(), base, - AsInt64Slice(update_literal.shape().dimensions()), - step, func); - - return std::move(result); - } - - StatusOr> ElementWiseUnaryOp( - HloInstruction* instruction, - const std::function& unary_op) { - const Literal& operand_literal = - parent_->GetEvaluatedLiteralFor(instruction->operand(0)); - TF_ASSIGN_OR_RETURN( - auto result_literal, - (ElementWiseUnaryOpImpl( - instruction, ConvertUnaryFunction(unary_op), operand_literal))); - - return std::move(result_literal); - } - - StatusOr> ElementWiseBinaryOp( - HloInstruction* instruction, - const std::function& - binary_op) { - const auto shape = instruction->shape(); - const auto* lhs = instruction->operand(0); - const auto* rhs = instruction->operand(1); - - // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast - // is removed. - if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) && - ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) { - return Unimplemented( - "Implicit broadcasting is currently unsupported in HLO evaluator " - "Shape Mismatch: %s vs %s vs %s: ", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str()); - } - - const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); - const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); - - auto result = Literal::CreateFromShape(shape); - - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice multi_index) { - return ConvertBinaryFunction(binary_op)( - lhs_literal.Get(multi_index), - rhs_literal.Get(multi_index)); - })); - return std::move(result); - } - - template - StatusOr> ElementwiseTernaryOp( - HloInstruction* instruction, - const std::function& ternary_op) { - const auto shape = instruction->shape(); - const auto* lhs = instruction->operand(0); - const auto* rhs = instruction->operand(1); - const auto* ehs = instruction->operand(2); - - // TODO(b/35950897, b/27796129): add DCHECK back once implicit - // broadcast is removed. 
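// As in the unary and binary helpers above, the operands must already share
// the result's dimensions: a kSelect over pred[2,3], f32[2,3], f32[2,3]
// evaluates element by element, while any mismatch is rejected by the
// Unimplemented check below rather than being implicitly broadcast.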
- if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) && - ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) && - ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) { - return Unimplemented( - "Implicit broadcasting is currently unsupported in HLO evaluator " - "Shape Mismatch: %s vs %s vs %s vs %s: ", - ShapeUtil::HumanString(shape).c_str(), - ShapeUtil::HumanString(lhs->shape()).c_str(), - ShapeUtil::HumanString(rhs->shape()).c_str(), - ShapeUtil::HumanString(ehs->shape()).c_str()); - } - - const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); - const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); - const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs); - - auto result = Literal::CreateFromShape(shape); - - TF_RETURN_IF_ERROR( - result->Populate([&](ArraySlice multi_index) { - return ternary_op(lhs_literal.Get(multi_index), - rhs_literal.Get(multi_index), - ehs_literal.Get(multi_index)); - })); - - return std::move(result); - } - - template - static bool IsShiftOutOfBounds(NativeT rhs) { - typedef typename std::make_unsigned::type UnsignedT; - UnsignedT lhs_size_unsigned = sizeof(NativeT) * CHAR_BIT; - UnsignedT rhs_unsigned = static_cast(rhs); - return rhs_unsigned >= lhs_size_unsigned; - } - - HloEvaluator* parent_; -}; // class HloEvaluator::TypedVisitor HloEvaluator::HloEvaluator(int64 max_loop_iterations) : max_loop_iterations_(max_loop_iterations) { - typed_visitors_[PRED] = MakeUnique>(this); - typed_visitors_[U8] = MakeUnique>(this); + typed_visitors_[PRED] = MakeUnique>(this); + typed_visitors_[U8] = MakeUnique>(this); typed_visitors_[U16] = MakeUnique([](HloInstruction*) { return Unimplemented( - "HloEvaluator::TypedVisitor: unhandled primitive type: U16."); + "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: " + "U16."); }); - typed_visitors_[U32] = MakeUnique>(this); - typed_visitors_[U64] = MakeUnique>(this); - typed_visitors_[S8] = MakeUnique>(this); + typed_visitors_[U32] = MakeUnique>(this); + typed_visitors_[U64] = MakeUnique>(this); + typed_visitors_[S8] = MakeUnique>(this); typed_visitors_[S16] = MakeUnique([](HloInstruction*) { return Unimplemented( - "HloEvaluator::TypedVisitor: unhandled primitive type: S16."); + "HloEvaluator::HloEvaluatorTypedVisitor: unhandled primitive type: " + "S16."); }); - typed_visitors_[S32] = MakeUnique>(this); - typed_visitors_[S64] = MakeUnique>(this); - typed_visitors_[F16] = MakeUnique>(this); - typed_visitors_[F32] = MakeUnique>(this); - typed_visitors_[F64] = MakeUnique>(this); - typed_visitors_[C64] = MakeUnique>(this); + typed_visitors_[S32] = MakeUnique>(this); + typed_visitors_[S64] = MakeUnique>(this); + typed_visitors_[F16] = + MakeUnique>(this); + typed_visitors_[F32] = MakeUnique>(this); + typed_visitors_[F64] = MakeUnique>(this); + typed_visitors_[C64] = MakeUnique>(this); // Most of the evaluator computations we use don't support BF16 (e.g., // std::ceil, std::tanh). To make evaluator work with BF16, we set all // elementwise computations to be done in F32 and do BF16<->F32 conversion // around the input and the output of the computations. 
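// Illustrative sketch of that split: with ReturnT = bfloat16 and
// ElementwiseT = float, the Convert*Function helpers in the typed visitor
// widen every element before the computation and narrow the result again,
// roughly
//   bfloat16 out = static_cast<bfloat16>(op(static_cast<float>(in)));
// so ops such as std::ceil and std::tanh only ever see float values.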
- typed_visitors_[BF16] = MakeUnique>(this); + typed_visitors_[BF16] = + MakeUnique>(this); typed_visitors_[TUPLE] = MakeUnique([](HloInstruction*) { return Unimplemented( - "HloEvaluator::TypedVistor: unhandled primitive type: TUPLE."); + "HloEvaluatorTypedVisitor: unhandled primitive type: TUPLE."); }); typed_visitors_[OPAQUE] = MakeUnique([](HloInstruction*) { return Unimplemented( - "HloEvaluator::TypedVisitor: unhandled primitive type: OPAQUE."); + "HloEvaluatorTypedVisitor: unhandled primitive type: OPAQUE."); }); } @@ -2367,6 +309,35 @@ StatusOr> HloEvaluator::EvaluateWithSubstitutions( return result; } +StatusOr> HloEvaluator::EvaluateElementwiseBinaryOp( + HloOpcode opcode, const Literal& lhs, const Literal& rhs) { + std::unique_ptr lhs_instr = + HloInstruction::CreateConstant(lhs.CloneToUnique()); + std::unique_ptr rhs_instr = + HloInstruction::CreateConstant(rhs.CloneToUnique()); + + std::unique_ptr cloned_instruction = + HloInstruction::CreateBinary(lhs.shape(), opcode, lhs_instr.get(), + rhs_instr.get()); + auto result = Evaluate(cloned_instruction.get()); + + cloned_instruction->DetachFromOperands(); + return result; +} + +StatusOr> HloEvaluator::EvaluateElementwiseUnaryOp( + HloOpcode opcode, const Literal& operand) { + std::unique_ptr operand_instr = + HloInstruction::CreateConstant(operand.CloneToUnique()); + + std::unique_ptr cloned_instruction = + HloInstruction::CreateUnary(operand.shape(), opcode, operand_instr.get()); + auto result = Evaluate(cloned_instruction.get()); + + cloned_instruction->DetachFromOperands(); + return result; +} + Status HloEvaluator::HandleParameter(HloInstruction* parameter) { CHECK_LT(parameter->parameter_number(), arg_literals_.size()); const Literal* input_literal = arg_literals_[parameter->parameter_number()]; @@ -2536,6 +507,11 @@ Status HloEvaluator::HandleCompare(HloInstruction* compare) { } break; case F16: return Unimplemented("unhandled primitive type: F16."); + case BF16: { + TF_ASSIGN_OR_RETURN(evaluated_[compare], + Compare(compare->shape(), opcode, + lhs_literal, rhs_literal)); + } break; case F32: { TF_ASSIGN_OR_RETURN( evaluated_[compare], @@ -2912,6 +888,28 @@ Status HloEvaluator::HandleGather(HloInstruction* gather) { return Status::OK(); } +Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) { + const Literal& operand = GetEvaluatedLiteralFor(broadcast->operand(0)); + + TF_RET_CHECK(broadcast->dimensions().size() == + ShapeUtil::Rank(operand.shape())) + << "broadcast dimensions is of size: " << broadcast->dimensions().size() + << " and rank of operand_to_broadcast is: " + << ShapeUtil::Rank(operand.shape()); + // Checks that operand's dimensions are the same as the broadcast's + // dimensions along the dimensions to be broadcasted. + for (int64 i = 0; i < broadcast->dimensions().size(); ++i) { + TF_RET_CHECK(broadcast->shape().dimensions(broadcast->dimensions(i)) == + operand.shape().dimensions(i)); + } + + TF_ASSIGN_OR_RETURN( + evaluated_[broadcast], + operand.Broadcast(broadcast->shape(), broadcast->dimensions())); + + return Status::OK(); +} + Status HloEvaluator::HandleGetTupleElement(HloInstruction* get_tuple_element) { const auto result_shape = get_tuple_element->shape(); const int64 index = get_tuple_element->tuple_index(); @@ -2963,12 +961,14 @@ Status HloEvaluator::HandleCall(HloInstruction* call) { } Status HloEvaluator::HandleFusion(HloInstruction* fusion) { + HloModuleConfig config; // Attach cloned computation to an empty HLO module so the existing ones are // not modified. 
- HloModule empty_hlo_module("EmptyModuleForFusion"); + HloModule empty_hlo_module("EmptyModuleForFusion", config); + HloCloneContext context(&empty_hlo_module); auto cloned_fused_computation = fusion->fused_instructions_computation()->Clone( - /*suffix=*/"clone_with_layout", &empty_hlo_module); + /*suffix=*/"clone_with_layout", &context); for (auto* instruction : cloned_fused_computation->instructions()) { LayoutUtil::SetToDefaultLayout(instruction->mutable_shape()); } @@ -3003,8 +1003,8 @@ Status HloEvaluator::HandleConditional(HloInstruction* conditional) { auto* true_computation = conditional->true_computation(); auto* false_computation = conditional->false_computation(); - auto result = Literal::CreateFromShape(conditional->shape()); HloEvaluator embedded_evaluator; + std::unique_ptr result; if (pred.Get({})) { result = embedded_evaluator .Evaluate(*true_computation, @@ -3028,7 +1028,7 @@ Status HloEvaluator::HandleSelect(HloInstruction* select) { // If predicate is of scalar type, no element-wise selection would be needed. // This would also handle output array of tuple types as the DefaultAction - // would go through the TypedVisitor which doesn't handle tuples. + // would go through the HloEvaluatorTypedVisitor which doesn't handle tuples. if (ShapeUtil::IsScalar(pred.shape())) { if (pred.Get({})) { evaluated_[select] = on_true.CloneToUnique(); diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h index c0dcee0c3e382f..b53d5644de5a17 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator.h +++ b/tensorflow/compiler/xla/service/hlo_evaluator.h @@ -18,6 +18,7 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -108,20 +109,23 @@ class HloEvaluator : public DfsHloVisitorWithDefault { const std::unordered_map& substitutions); + StatusOr> EvaluateElementwiseBinaryOp( + HloOpcode opcode, const Literal& lhs, const Literal& rhs); + + StatusOr> EvaluateElementwiseUnaryOp( + HloOpcode opcode, const Literal& operand); + protected: - // Templated DfsHloVisitor. Typically ReturnT here indicates the resulting - // literal type of each evaluated Handle* method of a TypedVisitor. - // There are however a few notable exceptions to this rule, notably: - // - HandleCompare and HandleIsFinite: where the resulting literal type is - // always boolean. - // These operations are handled outside of the parent HloEvaluator handlers - // instead of from within TypedVisitor. + // Make HloEvaluatorTypedVisitor a friend because it is logically part of this + // class. // - // Type params: - // - ReturnT: The type of input and output of each operation. - // - ElementwiseT: The type in which internal computation are done. - template - class TypedVisitor; + // A straightforward implementation would be to make it a nested class + // declared and defined in hlo_evaluator.cc. Instead HloEvaluatorTypedVisitor + // lives as a separate class with its own header because its template gets + // instantiated many times and we want to use extern templates to shard out + // the compilation of those instantiations across multiple cc files. + template + friend class HloEvaluatorTypedVisitor; // Wraps around instruction handling to infer types before dispatching to // the corresponding typed Visitor. 
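For orientation, a minimal sketch of how the EvaluateElementwiseUnaryOp /
EvaluateElementwiseBinaryOp helpers declared above might be called. The wrapper
name and literal values are made up, and it assumes the evaluator can still be
default-constructed (i.e. max_loop_iterations keeps a default):

```cpp
#include "tensorflow/compiler/xla/literal_util.h"
#include "tensorflow/compiler/xla/service/hlo_evaluator.h"

namespace xla {

// Hypothetical helper: fold two constant literals through the evaluator
// without first building an HloModule around them.
StatusOr<std::unique_ptr<Literal>> AddTwoConstants() {
  HloEvaluator evaluator;
  auto lhs = Literal::CreateR1<float>({1.0f, 2.0f});
  auto rhs = Literal::CreateR1<float>({3.0f, 4.0f});
  // Per the .cc hunk above, this wraps lhs/rhs in temporary constant
  // instructions, evaluates a synthetic kAdd, and detaches the temporaries.
  return evaluator.EvaluateElementwiseBinaryOp(HloOpcode::kAdd, *lhs, *rhs);
}

}  // namespace xla
```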
@@ -168,7 +172,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault { Status HandleSelect(HloInstruction* select) override; - private: + Status HandleBroadcast(HloInstruction* broadcast) override; + // Returns the already-evaluated literal result for the instruction. // A Constant instruction is considered evaluated and its literal will be // returned directly without looking up the cache. @@ -183,14 +188,6 @@ class HloEvaluator : public DfsHloVisitorWithDefault { return *(it->second); } - // Map from a primitive type to its associated (templated) DfsHloVisitor. - // Note: the hash function here is only needed because current gcc std::hash - // does not specialize for enum types. This should however be fixed in the - // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5 - tensorflow::gtl::FlatMap, - std::hash> - typed_visitors_; - // Tracks the HLO instruction and its evaluated literal result. // TODO(b/35950897): have better memory management here to free instructions // that are no longer a parent for any other subsequent instruction in @@ -199,6 +196,41 @@ class HloEvaluator : public DfsHloVisitorWithDefault { tensorflow::gtl::FlatMap> evaluated_; + private: + template + static StatusOr> ElementWiseUnaryOpImpl( + HloInstruction* instruction, + const std::function& unary_op, + const Literal& operand_literal) { + const auto shape = instruction->shape(); + const auto* operand = instruction->operand(0); + + // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is + // removed. + if (!ShapeUtil::SameDimensions(shape, operand->shape())) { + return Unimplemented( + "Implicit broadcasting is currently unsupported in HLO evaluator " + "Shape Mismatch: %s vs %s", + ShapeUtil::HumanString(shape).c_str(), + ShapeUtil::HumanString(operand->shape()).c_str()); + } + + auto result = MakeUnique(shape); + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice multi_index) { + return unary_op(operand_literal.Get(multi_index)); + })); + return std::move(result); + } + + // Map from a primitive type to its associated (templated) DfsHloVisitor. + // Note: the hash function here is only needed because current gcc std::hash + // does not specialize for enum types. This should however be fixed in the + // future: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=60970#c5 + tensorflow::gtl::FlatMap, + std::hash> + typed_visitors_; + // Caches pointers to input literals, assuming they are in post-order. // Literals are not owned by this class, and they must outlive the lifetime of // each invocation to the Evaluate* method. diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc index dd14dd38537a83..84b4ead2dd28ca 100644 --- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc +++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc @@ -21,7 +21,7 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -82,9 +82,9 @@ class HloEvaluatorTest : public ::testing::WithParamInterface, auto element_type = expected->shape().element_type(); if (element_type == F32 || element_type == F64) { ErrorSpec error(aabs); - LiteralTestUtil::ExpectNear(*expected, *result, error); + EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, error)); } else { - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } } @@ -100,7 +100,7 @@ class HloEvaluatorTest : public ::testing::WithParamInterface, std::unique_ptr result = Evaluate(); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } bool use_bfloat16_; @@ -129,7 +129,7 @@ TEST_P(HloEvaluatorTest, DoesClamp) { auto expected = Literal::CreateR2({{0, 4}, {2, 4}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) { @@ -150,7 +150,7 @@ TEST_P(HloEvaluatorTest, DISABLED_DoesClampSpecialBroadcast) { auto expected = Literal::CreateR2({{0, 0}, {1, 1}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } // Verifies that HloEvaluator evaluates a HLO instruction that performs select @@ -175,7 +175,7 @@ TEST_P(HloEvaluatorTest, DoesSelect) { auto expected = Literal::CreateR2({{2, 5}, {0, 4}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } // Verifies that HloEvaluator evaluates a HLO instruction that performs @@ -262,13 +262,13 @@ TEST_P(HloEvaluatorTest, DoesCosR2) { auto operand = Literal::CreateR2({{0, M_PI}, {-M_PI, 2 * M_PI}}); auto expected = Literal::CreateR2({{1, -1}, {-1, 1}}); TestUnaryOp(HloOpcode::kCos, std::move(expected), std::move(operand), - use_bfloat16_ ? 0x1.0P-5 : 0x1.0P-20); + use_bfloat16_ ? 0.031250 : 9.5367431640625E-7); } TEST_P(HloEvaluatorTest, DoesSinR2) { auto operand = Literal::CreateR2({{0, M_PI}, {-M_PI, 2 * M_PI}}); auto expected = Literal::CreateR2({{0, 0}, {0, 0}}); TestUnaryOp(HloOpcode::kSin, std::move(expected), std::move(operand), - use_bfloat16_ ? 0x1.0P-5 : 0x1.0P-20); + use_bfloat16_ ? 0.031250 : 9.5367431640625E-7); } TEST_P(HloEvaluatorTest, DoesNotR2) { auto operand = @@ -307,7 +307,7 @@ TEST_P(HloEvaluatorTest, DoesTraverseInstructions) { auto expected = Literal::CreateR2({{4, -16}, {-196, 12}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } // Verifies Reshape operation is correctly evaluated. 
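A quick check that the respelled tolerances above are the same values rather
than new ones: 0x1.0P-5 = 2^-5 = 0.03125 (written 0.031250) and
0x1.0P-20 = 2^-20 = 1/1048576 = 9.5367431640625E-7, so the bfloat16 and float
error bounds are unchanged and only their notation moves from hexadecimal
floats to plain decimals.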
@@ -315,7 +315,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) { HloComputation::Builder b(TestName()); const int64 dimensions[] = {11, 8, 7, 5, 9}; TF_ASSERT_OK_AND_ASSIGN(auto literal, - LiteralTestUtil::CreateRandomLiteral( + Literal::CreateRandomLiteral( ShapeUtil::MakeShape(F32, dimensions), 0.0, 1.0)); auto literal_clone = literal->CloneToUnique(); HloInstruction* literal_instruction = @@ -333,7 +333,7 @@ TEST_P(HloEvaluatorTest, DoesReshape) { result->EachCell( [&](tensorflow::gtl::ArraySlice indices, NativeT value) { std::vector rindexes = Permute(permutation, indices); - EXPECT_NEAR(value, literal_clone->Get(rindexes), 0x1.0P-5); + EXPECT_NEAR(value, literal_clone->Get(rindexes), 0.031250); }); } @@ -351,7 +351,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcast) { std::unique_ptr result = Evaluate({}); - LiteralTestUtil::ExpectEqual(*result, *output_literal); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *output_literal)); } TEST_P(HloEvaluatorTest, DoesBroadcastScalar) { @@ -370,7 +370,7 @@ TEST_P(HloEvaluatorTest, DoesBroadcastScalar) { std::unique_ptr result = Evaluate({}); - LiteralTestUtil::ExpectEqual(*result, *output_literal); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *output_literal)); } TEST_P(HloEvaluatorTest, DoesConcatenateSimple) { @@ -392,7 +392,7 @@ TEST_P(HloEvaluatorTest, DoesConcatenateSimple) { auto expected = Literal::CreateR2({{-1, -2}, {100, 200}, {-2, -3}, {-100, -200}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) { @@ -413,7 +413,7 @@ TEST_P(HloEvaluatorTest, ConcatenateHandlesShapeWithZeroElement) { std::unique_ptr result = Evaluate(); auto expected = Literal::CreateR1({100, 200}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, ConvertWithSameLayout) { @@ -432,7 +432,7 @@ TEST_P(HloEvaluatorTest, ConvertWithSameLayout) { std::unique_ptr result = Evaluate(); - LiteralTestUtil::ExpectEqual(*result, *expected); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) { @@ -452,7 +452,7 @@ TEST_P(HloEvaluatorTest, ConvertWithDifferentLayout) { std::unique_ptr result = Evaluate(); - LiteralTestUtil::ExpectEqual(*result, *expected); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } PaddingConfig CreatePaddingConfig( @@ -490,7 +490,7 @@ TEST_P(HloEvaluatorTest, Pad2DIntegerArrayWithZeroDimension) { auto expected = Literal::CreateR2( {{10, 10}, {10, 10}, {10, 10}, {10, 10}, {10, 10}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) { @@ -525,7 +525,7 @@ TEST_P(HloEvaluatorTest, Pad4DFloatArrayWithInteriorPadding) { auto expected = Literal::CreateR4FromArray4D(*expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, NegativePadding2D) { @@ -567,7 +567,7 @@ TEST_P(HloEvaluatorTest, NegativePadding2D) { (*expected_array)(0, 4) = 2.718f; auto expected = Literal::CreateR2FromArray2D(*expected_array); - LiteralTestUtil::ExpectNear(*expected, *result, ErrorSpec(0x1.0P-5)); + EXPECT_TRUE(LiteralTestUtil::Near(*expected, *result, ErrorSpec(0.031250))); } TEST_P(HloEvaluatorTest, NegativeAndInteriorPadding2D) { @@ -606,7 +606,7 @@ TEST_P(HloEvaluatorTest, 
NegativeAndInteriorPadding2D) { auto expected_array = MakeUnique>(0, 9); auto expected = Literal::CreateR2FromArray2D(*expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DotRank2AndRank1) { @@ -651,7 +651,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank1) { // clang-format on auto expected = Literal::CreateR2FromArray2D(expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DotRank1AndRank2) { @@ -688,7 +688,7 @@ TEST_P(HloEvaluatorTest, DotRank1AndRank2) { auto expected = Literal::CreateR1({22.f, 28.f}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DotRank2AndRank2) { @@ -737,7 +737,7 @@ TEST_P(HloEvaluatorTest, DotRank2AndRank2) { }); auto expected = Literal::CreateR2FromArray2D(expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, SimpleConv1D) { @@ -785,7 +785,7 @@ TEST_P(HloEvaluatorTest, SimpleConv1D) { Array3D expected_array = {{{11.f, 18.f, 9.f}}}; auto expected = Literal::CreateR3FromArray3D(expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) { @@ -827,7 +827,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) { *window.add_dimensions() = dim; ConvolutionDimensionNumbers dnums = - ComputationBuilder::CreateDefaultConvDimensionNumbers(2); + XlaBuilder::CreateDefaultConvDimensionNumbers(2); const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 4, 4}); b.AddInstruction(HloInstruction::CreateConvolve( @@ -847,7 +847,7 @@ TEST_P(HloEvaluatorTest, Simple4x4Conv2DWith2x2Kernel) { // clang-format on auto expected = Literal::CreateR4FromArray4D(expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) { @@ -927,7 +927,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensionsReversed) { auto expected = Literal::CreateR4FromArray4D( use_bfloat16_ ? expected_array_bf16 : expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) { @@ -1004,7 +1004,7 @@ TEST_P(HloEvaluatorTest, Conv2DGeneralDimensions) { auto expected = Literal::CreateR4FromArray4D( use_bfloat16_ ? 
expected_array_bf16 : expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) { @@ -1046,7 +1046,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) { *window.add_dimensions() = dim; ConvolutionDimensionNumbers dnums = - ComputationBuilder::CreateDefaultConvDimensionNumbers(2); + XlaBuilder::CreateDefaultConvDimensionNumbers(2); const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 7, 7}); b.AddInstruction(HloInstruction::CreateConvolve( @@ -1067,7 +1067,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithHighPadding) { })); auto expected = Literal::CreateR4FromArray4D(expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) { @@ -1109,7 +1109,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) { *window.add_dimensions() = dim; ConvolutionDimensionNumbers dnums = - ComputationBuilder::CreateDefaultConvDimensionNumbers(2); + XlaBuilder::CreateDefaultConvDimensionNumbers(2); const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 8, 8}); b.AddInstruction(HloInstruction::CreateConvolve( @@ -1131,7 +1131,7 @@ TEST_P(HloEvaluatorTest, DilatedBaseConv2DWithLowAndHighPadding) { })); auto expected = Literal::CreateR4FromArray4D(expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, @@ -1180,7 +1180,7 @@ TEST_P(HloEvaluatorTest, *window.add_dimensions() = dim; ConvolutionDimensionNumbers dnums = - ComputationBuilder::CreateDefaultConvDimensionNumbers(2); + XlaBuilder::CreateDefaultConvDimensionNumbers(2); const Shape& shape = ShapeUtil::MakeShape(F32, {1, 1, 9, 3}); b.AddInstruction(HloInstruction::CreateConvolve( @@ -1203,7 +1203,7 @@ TEST_P(HloEvaluatorTest, })); auto expected = Literal::CreateR4FromArray4D(expected_array); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } class HloEvaluatorPreciseReduceTest : public HloVerifiedTestBase {}; @@ -1319,7 +1319,7 @@ TEST_P(HloEvaluatorTest, ReduceAdd) { auto expected = Literal::CreateR1({6, 18}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, ReduceWindowMax) { @@ -1370,7 +1370,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowMax) { std::unique_ptr result = Evaluate(); auto expected = Literal::CreateR2({{6, 7}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, ReduceWindowAdd) { @@ -1427,7 +1427,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd) { std::unique_ptr result = Evaluate(); auto expected = Literal::CreateR2({{1, 3, 5}, {5, 11, 13}}); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) { @@ -1490,7 +1490,7 @@ TEST_P(HloEvaluatorTest, ReduceWindowAdd6D) { std::vector output_dims = {4, 3, 3, 3, 4, 4}; std::unique_ptr result_literal = Literal::CreateFullWithDescendingLayout(output_dims, 8.0f); - LiteralTestUtil::ExpectEqual(*result_literal, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*result_literal, *result)); } TEST_P(HloEvaluatorTest, StridedSlice) { @@ -1523,7 +1523,7 @@ TEST_P(HloEvaluatorTest, StridedSlice) { 
{19}, }); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DynamicSlice) { @@ -1556,7 +1556,7 @@ TEST_P(HloEvaluatorTest, DynamicSlice) { {6, 7, 8}, }); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } // Verifies that the HloEvaluator's implementation goes along with existing @@ -1591,7 +1591,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceModSlice) { {6, 7, 8}, }); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, DynamicSliceUpdate) { @@ -1627,7 +1627,7 @@ TEST_P(HloEvaluatorTest, DynamicSliceUpdate) { {5, -6, -7}, }); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, SetAndGetTuples) { @@ -1662,7 +1662,7 @@ TEST_P(HloEvaluatorTest, SetAndGetTuples) { {5, 6, 7}, }); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) { @@ -1703,7 +1703,7 @@ TEST_P(HloEvaluatorTest, SetAndGetNestedTuples) { result_inner_literal.get(), }); - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, Reverse) { @@ -1756,7 +1756,7 @@ TEST_P(HloEvaluatorTest, Reverse) { }); // clang-format on - LiteralTestUtil::ExpectEqual(*expected, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *result)); } TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) { @@ -1776,8 +1776,8 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutions) { add, {{param0, Literal::CreateR1({1, 2, 3, 4}).get()}, {square, Literal::CreateR1({10, 20, 30, 40}).get()}}); TF_ASSERT_OK(result.status()); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({11, 22, 33, 44}), - *result.ValueOrDie()); + EXPECT_TRUE(LiteralTestUtil::Equal( + *Literal::CreateR1({11, 22, 33, 44}), *result.ValueOrDie())); } // Check that EvaluateWithSubstitutions works if one of the operands to the op @@ -1800,8 +1800,8 @@ TEST_P(HloEvaluatorTest, EvaluateWithSubstitutionsWithConstantOperand) { auto result = evaluator.EvaluateWithSubstitutions( add, {{square, Literal::CreateR1({10, 20, 30, 40}).get()}}); TF_ASSERT_OK(result.status()); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({11, 22, 33, 44}), - *result.ValueOrDie()); + EXPECT_TRUE(LiteralTestUtil::Equal( + *Literal::CreateR1({11, 22, 33, 44}), *result.ValueOrDie())); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV1) { @@ -1823,9 +1823,9 @@ ENTRY main { std::unique_ptr operand = Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - LiteralTestUtil::ExpectEqual( - *Literal::CreateR2({{1, 2, 3}, {7, 8, 9}}), - *Evaluate({operand.get(), gather_indices.get()})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{1, 2, 3}, {7, 8, 9}}), + *Evaluate({operand.get(), gather_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherV2) { @@ -1847,9 +1847,9 @@ ENTRY main { std::unique_ptr operand = Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR2({{1, 3}, {4, 6}, {7, 9}}), - *Evaluate({operand.get(), gather_indices.get()})); + *Evaluate({operand.get(), 
gather_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherMultipleBatchDims) { @@ -1872,10 +1872,10 @@ ENTRY main { Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); std::unique_ptr gather_indices = Literal::CreateR2({{0, 2}, {2, 1}}); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR3( {{{1, 3}, {4, 6}, {7, 9}}, {{3, 2}, {6, 5}, {9, 8}}}), - *Evaluate({operand.get(), gather_indices.get()})); + *Evaluate({operand.get(), gather_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_TensorFlowGatherNd) { @@ -1900,9 +1900,9 @@ ENTRY main { {{-7, 7}, {-8, 8}, {-9, 9}}}); std::unique_ptr gather_indices = Literal::CreateR2({{0, 0}, {1, 0}}); - LiteralTestUtil::ExpectEqual( - *Literal::CreateR2({{-1, 1}, {-4, 4}}), - *Evaluate({operand.get(), gather_indices.get()})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{-1, 1}, {-4, 4}}), + *Evaluate({operand.get(), gather_indices.get()}))); } TEST_P(HloEvaluatorTest, @@ -1928,9 +1928,9 @@ ENTRY main { {{-7, 7}, {-8, 8}, {-9, 9}}}); std::unique_ptr gather_indices = Literal::CreateR2({{0, 0}, {1, 0}}); - LiteralTestUtil::ExpectEqual( - *Literal::CreateR2({{-2, 2}, {-1, 1}}), - *Evaluate({operand.get(), gather_indices.get()})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{-2, 2}, {-1, 1}}), + *Evaluate({operand.get(), gather_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_DynamicSlice) { @@ -1952,9 +1952,9 @@ ENTRY main { std::unique_ptr operand = Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); std::unique_ptr gather_indices = Literal::CreateR1({1, 1}); - LiteralTestUtil::ExpectEqual( - *Literal::CreateR2({{5}}), - *Evaluate({operand.get(), gather_indices.get()})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{5}}), + *Evaluate({operand.get(), gather_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_BatchDynamicSlice) { @@ -1977,9 +1977,9 @@ ENTRY main { Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); std::unique_ptr gather_indices = Literal::CreateR2({{2, 1}, {1, 1}}); - LiteralTestUtil::ExpectEqual( - *Literal::CreateR3({{{8}}, {{5}}}), - *Evaluate({operand.get(), gather_indices.get()})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR3({{{8}}, {{5}}}), + *Evaluate({operand.get(), gather_indices.get()}))); } TEST_P(HloEvaluatorTest, EvaluateGather_ZeroDimBounds) { @@ -2000,9 +2000,50 @@ ENTRY main { ParseAndVerifyModule(hlo_text); std::unique_ptr operand = Literal::CreateR2({{}, {}, {}}); std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); - LiteralTestUtil::ExpectEqual( - *Literal::CreateR2({{}, {}}), - *Evaluate({operand.get(), gather_indices.get()})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{}, {}}), + *Evaluate({operand.get(), gather_indices.get()}))); +} + +TEST_P(HloEvaluatorTest, EvaluateGather_NoOutputWindowDims) { + const string hlo_text = R"( +HloModule GatherXd + +ENTRY main { + operand = s32[3] parameter(0) + indices = s32[2,2,1] parameter(1) + ROOT gather = s32[2,2] gather(operand, indices), + output_window_dims={}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=2, + window_bounds={1} +} +)"; + ParseAndVerifyModule(hlo_text); + + std::unique_ptr operand = Literal::CreateR1({0, 1, 2}); + std::unique_ptr gather_indices = + Literal::CreateR3({{{0}, {1}}, {{2}, {1}}}); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{0, 1}, {2, 1}}), + *Evaluate({operand.get(), gather_indices.get()}))); +} + +// Verifies that 
HloEvaluator evaluates a HLO instruction that performs +// element-wise comparison with 2 bfloat16 operands. +TEST_P(HloEvaluatorTest, DoesCompareBF16) { + // lhs >= rhs + auto lhs = Literal::CreateR2( + {{bfloat16(0.25), bfloat16(0.35), bfloat16(0.125)}, + {bfloat16(-0.25), bfloat16(-0.35), bfloat16(-0.125)}}); + auto rhs = Literal::CreateR2( + {{bfloat16(0.5), bfloat16(0.125), bfloat16(0.125)}, + {bfloat16(0.25), bfloat16(-0.375), bfloat16(-0.127)}}); + auto expected = + Literal::CreateR2({{false, true, true}, {false, true, true}}); + TestBinaryOp(HloOpcode::kGe, std::move(expected), std::move(lhs), + std::move(rhs)); } INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest, diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h new file mode 100644 index 00000000000000..13f46407e33e36 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h @@ -0,0 +1,2142 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_ + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/service/shape_inference.h" +#include "tensorflow/core/lib/core/casts.h" +#include "tensorflow/core/lib/gtl/optional.h" + +namespace xla { + +// TODO(b/79274244): We'd like these type traits to live inside of +// HloEvaluatorTypedVisitor so they don't pollute namespace xla, but that +// crashes clang in the frontend. +// +// Anyway this is relatively safe as-is because hlo_evaluator_typed_visitor.h is +// a "private" header that's not exposed outside of hlo_evaluator.cc. +template +using is_complex_t = std::is_same; +template +using is_complex64_t = std::is_same; + +// Templated DfsHloVisitor for use by HloEvaluator. +// +// Typically ReturnT here indicates the resulting literal type of each evaluated +// Handle* method of a TypedVisitor. There are however a few notable exceptions +// to this rule, notably: +// - HandleCompare and HandleIsFinite: where the resulting literal type is +// always boolean. +// These operations are handled outside of the parent HloEvaluator handlers +// instead of from within TypedVisitor. +// +// Type params: +// - ReturnT: The type of input and output of each operation. +// - ElementwiseT: The type in which internal computation are done. +// +// This a logically a private part of HloEvaluator. It lives in this header +// file rather than in hlo_evaluator.cc because we use extern templates and a +// bunch of independent cc files to speed up compiling the many instantiations +// of this class. 
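A minimal sketch of the extern-template arrangement that comment describes,
with illustrative file names and an illustrative pair of instantiations (the
actual list is set up elsewhere in the patch):

```cpp
// In hlo_evaluator_typed_visitor.h, after the class template definition:
// declare, but do not instantiate, the specializations that other translation
// units will provide.
extern template class HloEvaluatorTypedVisitor<float>;
extern template class HloEvaluatorTypedVisitor<int32>;

// In a small per-type file such as hlo_evaluator_typed_visitor_float.cc: the
// expensive instantiation is compiled once, in its own translation unit,
// instead of inside hlo_evaluator.cc.
#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h"
namespace xla {
template class HloEvaluatorTypedVisitor<float>;
}  // namespace xla
```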
+template +class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault { + public: + explicit HloEvaluatorTypedVisitor(HloEvaluator* p) : parent_(p) {} + + // The following higher-order functions convert a function with ElementwiseT + // to a function with ReturnT. + std::function ConvertUnaryFunction( + const std::function& unary_op) { + return [&unary_op](ReturnT arg) { + return static_cast(unary_op(static_cast(arg))); + }; + } + std::function ConvertBinaryFunction( + const std::function& + binary_op) { + return [&binary_op](ReturnT arg1, ReturnT arg2) { + return static_cast(binary_op(static_cast(arg1), + static_cast(arg2))); + }; + } + std::function ConvertTernaryFunction( + const std::function& ternary_op) { + return [&ternary_op](ReturnT arg1, ReturnT arg2, ReturnT arg3) { + return static_cast(ternary_op(static_cast(arg1), + static_cast(arg2), + static_cast(arg3))); + }; + } + + Status DefaultAction(HloInstruction* hlo_instruction) override { + return Unimplemented("unhandled HLO ops for HloEvaluator: %s.", + HloOpcodeString(hlo_instruction->opcode()).c_str()); + } + + // TODO(b/35950897): many of the stl functions used in the handlers are not + // overloaded for every XLA primitive type. + + template ::value>::type* = + nullptr> + Status HandleAbs(HloInstruction* abs) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs], + ElementWiseUnaryOp(abs, [](NativeT elem_operand) { + return elem_operand; + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleAbs(HloInstruction* abs) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[abs], + ElementWiseUnaryOp(abs, [](NativeT elem_operand) { + return std::abs(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleAbs(HloInstruction* abs) { + const Literal& operand_literal = + parent_->GetEvaluatedLiteralFor(abs->operand(0)); + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[abs], + (HloEvaluator::ElementWiseUnaryOpImpl( + abs, [](NativeT elem_operand) { return std::abs(elem_operand); }, + operand_literal))); + + return Status::OK(); + } + + Status HandleAbs(HloInstruction* abs) override { + // If the operand is of C64 type, the return type of abs will be F32. + // However, ElementwiseT would still be the return type, F32, and thus + // specifying the ElementwiseT explicitly as C64 is needed below. 
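// Concretely: std::abs(complex64{3.0f, 4.0f}) is the float 5.0f, so for a C64
// operand the evaluated literal holds F32 elements even though each
// per-element computation still consumes complex64 values; the explicit
// dispatch below keeps those two types separate.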
+ if (abs->operand(0)->shape().element_type() == C64) { + return HandleAbs(abs); + } + return HandleAbs(abs); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleRound(HloInstruction* round) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[round], + ElementWiseUnaryOp(round, [](ElementwiseT elem_operand) { + return std::round(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleRound(HloInstruction* round) { + return InvalidArgument("Unsupported type for Round"); + } + + Status HandleRound(HloInstruction* round) override { + return HandleRound(round); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleCeil(HloInstruction* ceil) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[ceil], + ElementWiseUnaryOp(ceil, [](ElementwiseT elem_operand) { + return std::ceil(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleCeil(HloInstruction* ceil) { + return InvalidArgument("Unsupported type for Ceil"); + } + + Status HandleCeil(HloInstruction* ceil) override { + return HandleCeil(ceil); + } + + Status HandleConvert(HloInstruction* convert) override { + const HloInstruction* operand = convert->operand(0); + TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape())); + TF_ASSIGN_OR_RETURN(std::unique_ptr result, + parent_->GetEvaluatedLiteralFor(operand).Convert( + convert->shape().element_type())); + + if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) { + parent_->evaluated_[convert] = std::move(result); + } else { + parent_->evaluated_[convert] = + result->Relayout(convert->shape().layout()); + } + return Status::OK(); + } + + Status HandleBitcastConvert(HloInstruction* convert) override { + const HloInstruction* operand = convert->operand(0); + TF_RET_CHECK(ShapeUtil::SameDimensions(operand->shape(), convert->shape())); + TF_ASSIGN_OR_RETURN(std::unique_ptr result, + parent_->GetEvaluatedLiteralFor(operand).BitcastConvert( + convert->shape().element_type())); + + if (LayoutUtil::LayoutsInShapesEqual(result->shape(), convert->shape())) { + parent_->evaluated_[convert] = std::move(result); + } else { + parent_->evaluated_[convert] = + result->Relayout(convert->shape().layout()); + } + return Status::OK(); + } + + Status HandleExp(HloInstruction* exp) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[exp], + ElementWiseUnaryOp(exp, [](ElementwiseT elem_operand) { + return std::exp(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleExpm1(HloInstruction* expm1) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[expm1], + ElementWiseUnaryOp(expm1, [](ElementwiseT elem_operand) { + return std::expm1(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleExpm1(HloInstruction* floor) { + return InvalidArgument("Unsupported type for Expm1"); + } + + Status HandleExpm1(HloInstruction* floor) override { + return HandleExpm1(floor); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleFloor(HloInstruction* floor) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[floor], + ElementWiseUnaryOp(floor, [](ElementwiseT elem_operand) { + return 
std::floor(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleFloor(HloInstruction* floor) { + return InvalidArgument("Unsupported type for Floor"); + } + + Status HandleFloor(HloInstruction* floor) override { + return HandleFloor(floor); + } + + Status HandleLog(HloInstruction* log) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[log], + ElementWiseUnaryOp(log, [](ElementwiseT elem_operand) { + return std::log(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleLog1p(HloInstruction* expm1) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[expm1], + ElementWiseUnaryOp(expm1, [](ElementwiseT elem_operand) { + return std::log1p(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleLog1p(HloInstruction* floor) { + return InvalidArgument("Unsupported type for Log1p"); + } + + Status HandleLog1p(HloInstruction* floor) override { + return HandleLog1p(floor); + } + + template ::value && + !std::is_same::value>::type* = nullptr> + Status HandleNot(HloInstruction* not_) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_], + ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) { + return ~elem_operand; + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleNot(HloInstruction* not_) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_], + ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) { + return !elem_operand; + })); + return Status::OK(); + } + + template ::value>::type* = + nullptr> + Status HandleNot(HloInstruction* not_) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[not_], + ElementWiseUnaryOp(not_, [](ElementwiseT elem_operand) { + return !elem_operand; + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleNot(HloInstruction* not_) { + return InvalidArgument("Unsupported type for Not"); + } + + Status HandleNot(HloInstruction* not_) override { + return HandleNot(not_); + } + + template ::value && + !std::is_floating_point::value>::type* = nullptr> + Status HandleNegate(HloInstruction* negate) { + using type = typename std::make_unsigned::type; + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[negate], + ElementWiseUnaryOp(negate, [](ElementwiseT elem_operand) { + return NativeT(-type(elem_operand)); + })); + return Status::OK(); + } + + template ::value || + std::is_floating_point::value>::type* = nullptr> + Status HandleNegate(HloInstruction* negate) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[negate], + ElementWiseUnaryOp( + negate, [](ElementwiseT elem_operand) { return -elem_operand; })); + return Status::OK(); + } + + Status HandleNegate(HloInstruction* negate) override { + return HandleNegate(negate); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleSign(HloInstruction* sign) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign], + ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) { + return (ElementwiseT(0) < elem_operand) - + (elem_operand < ElementwiseT(0)); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleSign(HloInstruction* sign) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[sign], + ElementWiseUnaryOp(sign, [](ElementwiseT elem_operand) 
{ + auto abs_val = std::abs(elem_operand); + return 0 == abs_val ? ElementwiseT(0) + : elem_operand / abs_val; + })); + return Status::OK(); + } + + Status HandleSign(HloInstruction* sign) override { + return HandleSign(sign); + } + + template ::value>::type* = nullptr> + Status HandleAtan2(HloInstruction* atan2) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[atan2], + ElementWiseBinaryOp(atan2, [](ElementwiseT lhs_elem, + ElementwiseT rhs_elem) { + return std::atan2(lhs_elem, rhs_elem); + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleAtan2(HloInstruction* atan2) { + return InvalidArgument("Unsupported type for Atan2"); + } + + Status HandleAtan2(HloInstruction* atan2) override { + return HandleAtan2(atan2); + } + + Status HandleTanh(HloInstruction* tanh) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[tanh], + ElementWiseUnaryOp(tanh, [](ElementwiseT elem_operand) { + return std::tanh(elem_operand); + })); + return Status::OK(); + } + + template ::value && + !std::is_floating_point::value>::type* = nullptr> + Status HandleMultiply(HloInstruction* multiply) { + using type = typename std::make_unsigned::type; + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[multiply], + ElementWiseBinaryOp(multiply, + [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) { + return NativeT(type(lhs_elem) * type(rhs_elem)); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value || + std::is_floating_point::value || + is_complex_t::value>::type* = nullptr> + Status HandleMultiply(HloInstruction* multiply) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[multiply], + ElementWiseBinaryOp(multiply, + [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) { + return lhs_elem * rhs_elem; + })); + return Status::OK(); + } + + Status HandleMultiply(HloInstruction* multiply) override { + return HandleMultiply(multiply); + } + + Status HandleSubtract(HloInstruction* subtract) override { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[subtract], + ElementWiseBinaryOp(subtract, + [](ElementwiseT lhs_elem, ElementwiseT rhs_elem) { + return lhs_elem - rhs_elem; + })); + return Status::OK(); + } + + Status HandleAdd(HloInstruction* add) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[add], + ElementWiseBinaryOp(add, [](ElementwiseT lhs_elem, + ElementwiseT rhs_elem) { + return lhs_elem + rhs_elem; + })); + return Status::OK(); + } + + Status HandleDivide(HloInstruction* divide) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[divide], + ElementWiseBinaryOp(divide, [](ElementwiseT lhs_elem, + ElementwiseT rhs_elem) { + return lhs_elem / rhs_elem; + })); + return Status::OK(); + } + + template ::value>::type* = + nullptr> + Status HandleMaximum(HloInstruction* maximum) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[maximum], + ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) { + return std::max(lhs, rhs); + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleMaximum(HloInstruction* maximum) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[maximum], + ElementWiseBinaryOp(maximum, [](ElementwiseT lhs, ElementwiseT rhs) { + return ((lhs >= rhs) || std::isnan(lhs)) ? 
lhs : rhs; + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleMaximum(HloInstruction* maximum) { + return InvalidArgument("Unsupported type for Maximum"); + } + + Status HandleMaximum(HloInstruction* maximum) override { + return HandleMaximum(maximum); + } + + template ::value>::type* = + nullptr> + Status HandleMinimum(HloInstruction* minimum) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[minimum], + ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el, + ElementwiseT rhs_el) { + return std::min(lhs_el, rhs_el); + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleMinimum(HloInstruction* minimum) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[minimum], + ElementWiseBinaryOp(minimum, [](ElementwiseT lhs_el, + ElementwiseT rhs_el) { + return ((lhs_el <= rhs_el) || std::isnan(lhs_el)) ? lhs_el : rhs_el; + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleMinimum(HloInstruction* minimum) { + return InvalidArgument("Unsupported type for Minimum"); + } + + Status HandleMinimum(HloInstruction* minimum) override { + return HandleMinimum(minimum); + } + + Status HandlePower(HloInstruction* power) override { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[power], + ElementWiseBinaryOp(power, [](ElementwiseT lhs_el, + ElementwiseT rhs_el) { + return std::pow(lhs_el, rhs_el); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleRemainder(HloInstruction* remainder) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[remainder], + ElementWiseBinaryOp(remainder, [](ElementwiseT lhs_el, + ElementwiseT rhs_el) { + return std::fmod(lhs_el, rhs_el); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleRemainder(HloInstruction* remainder) { + return InvalidArgument("Unsupported type for Remainder"); + } + + Status HandleRemainder(HloInstruction* remainder) override { + return HandleRemainder(remainder); + } + + template ::value>::type* = + nullptr> + Status HandleAnd(HloInstruction* and_) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[and_], + ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { + return lhs_el & rhs_el; + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleAnd(HloInstruction* and_) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[and_], + ElementWiseBinaryOp(and_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { + return lhs_el && rhs_el; + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleAnd(HloInstruction* and_) { + return InvalidArgument("Unsupported type for And"); + } + + Status HandleAnd(HloInstruction* and_) override { + return HandleAnd(and_); + } + + template ::value>::type* = + nullptr> + Status HandleOr(HloInstruction* or_) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[or_], + ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { + return lhs_el | rhs_el; + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleOr(HloInstruction* or_) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[or_], + ElementWiseBinaryOp(or_, [](ElementwiseT lhs_el, ElementwiseT rhs_el) { + return lhs_el || rhs_el; + })); + return Status::OK(); + } + + template < + typename 
NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleOr(HloInstruction* or_) { + return InvalidArgument("Unsupported type for Or"); + } + + Status HandleOr(HloInstruction* or_) override { + return HandleOr(or_); + } + + template ::value && + !std::is_same::value>::type* = nullptr> + Status HandleShiftLeft(HloInstruction* shl) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[shl], + ElementWiseBinaryOp(shl, [](NativeT lhs_elem, NativeT rhs_elem) { + return IsShiftOutOfBounds(rhs_elem) ? 0 + : (lhs_elem << rhs_elem); + })); + return Status::OK(); + } + + template ::value || + std::is_same::value>::type* = + nullptr> + Status HandleShiftLeft(HloInstruction*) { + return InvalidArgument("Unsupported type for ShiftLeft"); + } + + Status HandleShiftLeft(HloInstruction* shl) override { + return HandleShiftLeft(shl); + } + template ::value && + !std::is_same::value>::type* = nullptr> + Status HandleShiftRightArithmetic(HloInstruction* shr) { + typedef typename std::make_signed::type SignedT; + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[shr], + ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) { + SignedT lhs_signed = static_cast(lhs_elem); + if (IsShiftOutOfBounds(rhs_elem)) { + return lhs_signed < 0 ? static_cast(-1) : 0; + } else { + return lhs_signed >> rhs_elem; + } + })); + return Status::OK(); + } + + template ::value || + std::is_same::value>::type* = + nullptr> + Status HandleShiftRightArithmetic(HloInstruction*) { + return InvalidArgument("Unsupported type for ShiftRightArithmetic"); + } + + Status HandleShiftRightArithmetic(HloInstruction* shra) override { + return HandleShiftRightArithmetic(shra); + } + + template ::value && + !std::is_same::value>::type* = nullptr> + Status HandleShiftRightLogical(HloInstruction* shr) { + typedef typename std::make_unsigned::type UnsignedT; + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[shr], + ElementWiseBinaryOp(shr, [](NativeT lhs_elem, NativeT rhs_elem) { + // If shift amount is greater than the number of bits, then return 0. 
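The handlers above all follow the same shape: one `enable_if`-constrained template overload per supported category of element type, an overload that returns `InvalidArgument` for unsupported types, and a virtual override that forwards to the matching template. The shift handlers additionally guard against shift amounts that equal or exceed the bit width, which would be undefined behavior in C++. The following is a minimal standalone sketch of that dispatch pattern and shift guard; the function names (`IsShiftOutOfBounds` aside, which mirrors the helper in this file) and types are illustrative only, not the XLA API.

```c++
#include <climits>
#include <cstdint>
#include <iostream>
#include <type_traits>

template <typename T>
bool IsShiftOutOfBounds(T rhs) {
  using Unsigned = typename std::make_unsigned<T>::type;
  // Shifting by >= the bit width is undefined behavior, so define it instead.
  return static_cast<Unsigned>(rhs) >= sizeof(T) * CHAR_BIT;
}

// Enabled only for integral element types: out-of-range shifts produce 0.
template <typename T,
          typename std::enable_if<std::is_integral<T>::value>::type* = nullptr>
T ShiftLeftOrZero(T lhs, T rhs) {
  return IsShiftOutOfBounds(rhs) ? T(0) : T(lhs << rhs);
}

// Fallback for unsupported element types, analogous to InvalidArgument above.
template <typename T,
          typename std::enable_if<!std::is_integral<T>::value>::type* = nullptr>
T ShiftLeftOrZero(T, T) {
  static_assert(std::is_integral<T>::value, "Unsupported type for ShiftLeft");
  return T();
}

int main() {
  std::cout << ShiftLeftOrZero<int32_t>(1, 3) << "\n";   // 8
  std::cout << ShiftLeftOrZero<int32_t>(1, 40) << "\n";  // 0 rather than UB
}
```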
+ if (IsShiftOutOfBounds(rhs_elem)) { + return static_cast(0); + } + return static_cast(static_cast(lhs_elem) >> + rhs_elem); + })); + return Status::OK(); + } + + template ::value || + std::is_same::value>::type* = + nullptr> + Status HandleShiftRightLogical(HloInstruction*) { + return InvalidArgument("Unsupported type for ShiftRightLogical"); + } + + Status HandleShiftRightLogical(HloInstruction* shrl) override { + return HandleShiftRightLogical(shrl); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleClamp(HloInstruction* clamp) { + std::function + clamp_op = [](ElementwiseT low, ElementwiseT value, ElementwiseT high) { + return std::fmin(high, std::fmax(value, low)); + }; + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[clamp], + ElementwiseTernaryOp(clamp, + std::move(ConvertTernaryFunction(clamp_op)))); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value>::type* = nullptr> + Status HandleClamp(HloInstruction*) { + return InvalidArgument("Unsupported type for Clamp"); + } + + Status HandleClamp(HloInstruction* clamp) override { + return HandleClamp(clamp); + } + + Status HandleSelect(HloInstruction* select) override { + CHECK(!ShapeUtil::IsScalar(select->operand(0)->shape())); + CHECK(!ShapeUtil::IsTuple(select->shape())); + std::function select_op = + [](bool pred, ReturnT on_true, ReturnT on_false) { + if (pred) { + return on_true; + } + return on_false; + }; + TF_ASSIGN_OR_RETURN(parent_->evaluated_[select], + ElementwiseTernaryOp(select, std::move(select_op))); + return Status::OK(); + } + + Status HandleReverse(HloInstruction* reverse) override { + const auto result_shape = reverse->shape(); + const auto reverse_dimensions = reverse->dimensions(); + + auto operand = reverse->operand(0); + TF_ASSIGN_OR_RETURN(auto inferred_return_shape, + ShapeInference::InferReverseShape(operand->shape(), + reverse_dimensions)); + + TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) + << "return shape set to: " << ShapeUtil::HumanString(result_shape) + << " but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + + const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); + auto result = MakeUnique(result_shape); + + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice out_index) { + std::vector from_index(out_index.begin(), out_index.end()); + for (const int64 dim : reverse_dimensions) { + from_index[dim] = result_shape.dimensions(dim) - 1 - out_index[dim]; + } + return operand_literal.Get(from_index); + })); + + parent_->evaluated_[reverse] = std::move(result); + return Status::OK(); + } + + Status HandleConvolution(HloInstruction* conv) override { + auto lhs = conv->operand(0); + auto rhs = conv->operand(1); + const auto& window = conv->window(); + const Shape& result_shape = conv->shape(); + const Shape& lhs_shape = lhs->shape(); + const Shape& rhs_shape = rhs->shape(); + + TF_CHECK_OK(ShapeUtil::ValidateShape(lhs_shape)); + TF_CHECK_OK(ShapeUtil::ValidateShape(rhs_shape)); + CHECK(ShapeUtil::IsArray(lhs_shape)); + CHECK(ShapeUtil::IsArray(rhs_shape)); + CHECK(ShapeUtil::SameElementType(lhs_shape, rhs_shape)); + CHECK(ShapeUtil::SameElementType(lhs_shape, result_shape)); + + const auto& dnums = conv->convolution_dimension_numbers(); + const int64 num_spatial_dims = dnums.output_spatial_dimensions_size(); + CHECK_EQ(num_spatial_dims, dnums.input_spatial_dimensions_size()); + CHECK_EQ(num_spatial_dims, 
dnums.kernel_spatial_dimensions_size()); + CHECK_GE(num_spatial_dims, 0); + CHECK_EQ(window.dimensions_size(), num_spatial_dims); + + const auto lhs_rank = ShapeUtil::Rank(lhs_shape); + const auto rhs_rank = ShapeUtil::Rank(rhs_shape); + + CHECK_EQ(num_spatial_dims + 2, lhs_rank); + CHECK_EQ(num_spatial_dims + 2, rhs_rank); + + TF_ASSIGN_OR_RETURN(auto inferred_return_shape, + ShapeInference::InferConvolveShape(lhs_shape, rhs_shape, + window, dnums)); + CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) + << "return shape set to: " << ShapeUtil::HumanString(result_shape) + << " but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + + const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); + const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); + + std::vector window_dimension_sizes; + for (auto i : dnums.kernel_spatial_dimensions()) { + window_dimension_sizes.push_back(ShapeUtil::GetDimension(rhs_shape, i)); + } + + const Shape& window_shape = + ShapeUtil::MakeShape(rhs_shape.element_type(), window_dimension_sizes); + + DimensionVector lhs_dim_multipliers = MakeDimMultipliers(lhs_shape); + DimensionVector rhs_dim_multipliers = MakeDimMultipliers(rhs_shape); + + auto lhs_literal_data = lhs_literal.data(); + auto rhs_literal_data = rhs_literal.data(); + + auto func = [&window_shape, &dnums, &lhs_shape, &rhs_shape, &window, + &lhs_dim_multipliers, &rhs_dim_multipliers, lhs_literal_data, + rhs_literal_data]( + tensorflow::gtl::ArraySlice out_index) { + // Dimension number applicable for input (lhs). + const int64 input_batch_dim = dnums.input_batch_dimension(); + const int64 input_z_dim = dnums.input_feature_dimension(); + // Dimension number applicable for kernel (rhs). + const int64 kernel_input_z_dim = dnums.kernel_input_feature_dimension(); + const int64 kernel_output_z_dim = dnums.kernel_output_feature_dimension(); + // Dimension number applicable for output. + const int64 output_batch_dim = dnums.output_batch_dimension(); + const int64 output_z_dim = dnums.output_feature_dimension(); + + const int64 z_size = ShapeUtil::GetDimension(lhs_shape, input_z_dim); + + ElementwiseT result_val = static_cast(0); + DimensionVector rhs_spatial_index(dnums.kernel_spatial_dimensions_size(), + 0); + + // Convolve input feature with kernel. + do { + for (int64 iz = 0; iz < z_size; ++iz) { + int64 lhs_linear_index = 0; + lhs_linear_index += out_index[output_batch_dim] * + lhs_dim_multipliers[input_batch_dim]; + lhs_linear_index += iz * lhs_dim_multipliers[input_z_dim]; + + int64 rhs_linear_index = 0; + rhs_linear_index += out_index[output_z_dim] * + rhs_dim_multipliers[kernel_output_z_dim]; + rhs_linear_index += iz * rhs_dim_multipliers[kernel_input_z_dim]; + + // Find corresponding spatial dimension index for input (lhs). + for (int64 ki = 0; ki < rhs_spatial_index.size(); ++ki) { + // Spatial dimension number for input (lhs) and output. + const int64 input_spatial_dim = dnums.input_spatial_dimensions(ki); + const int64 output_spatial_dim = + dnums.output_spatial_dimensions(ki); + + // Calculate lhs (input) index without taking base dilation into + // account. + const auto& window_dim = window.dimensions(ki); + const int64 undilated_index = + out_index[output_spatial_dim] * window_dim.stride() - + window_dim.padding_low() + + rhs_spatial_index[ki] * window_dim.window_dilation(); + // Skip if the lhs (input) index is to be dilated. As an + // optimization, skip this mod if there's no dilation. 
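The convolution loop above maps each (output index, kernel tap) pair back to an input position per spatial dimension, honoring stride, low padding, window dilation, and base (input) dilation. A minimal single-dimension sketch of that index arithmetic follows; `WindowDim` and `MapToInput` are hypothetical stand-ins, not the XLA `Window` proto or evaluator code.

```c++
#include <cstdint>
#include <iostream>

struct WindowDim {
  int64_t stride;
  int64_t padding_low;
  int64_t window_dilation;
  int64_t base_dilation;
};

// Returns true and sets *input_index if the kernel tap lands on a real input
// element; returns false if it falls in padding or in a base-dilation hole.
bool MapToInput(int64_t output_index, int64_t kernel_index, int64_t input_size,
                const WindowDim& dim, int64_t* input_index) {
  const int64_t undilated = output_index * dim.stride - dim.padding_low +
                            kernel_index * dim.window_dilation;
  if (dim.base_dilation > 1 && undilated % dim.base_dilation != 0) {
    return false;  // Position is a dilation hole, so the tap is skipped.
  }
  const int64_t idx =
      dim.base_dilation > 1 ? undilated / dim.base_dilation : undilated;
  if (idx < 0 || idx >= input_size) {
    return false;  // Position falls in the padding region.
  }
  *input_index = idx;
  return true;
}

int main() {
  WindowDim dim{/*stride=*/2, /*padding_low=*/1, /*window_dilation=*/1,
                /*base_dilation=*/1};
  int64_t idx;
  std::cout << MapToInput(0, 0, 5, dim, &idx) << "\n";             // 0: padding
  std::cout << MapToInput(1, 1, 5, dim, &idx) << " " << idx << "\n";  // 1 2
}
```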
+ if (window_dim.base_dilation() > 1 && + undilated_index % window_dim.base_dilation() != 0) { + goto cnt; + } + + // Calculate the actual lhs (input) index after dilation. As an + // optimization, skip this integer divide if there's no dilation. + int64 lhs_spatial_index; + if (window_dim.base_dilation() > 1) { + lhs_spatial_index = undilated_index / window_dim.base_dilation(); + } else { + lhs_spatial_index = undilated_index; + } + lhs_linear_index += + lhs_spatial_index * lhs_dim_multipliers[input_spatial_dim]; + + // Skip if input index is not in bounds. + if (!(lhs_spatial_index >= 0 && + lhs_spatial_index < + lhs_shape.dimensions(input_spatial_dim))) { + goto cnt; + } + + rhs_linear_index += + (window_dim.window_reversal() + ? ((window_dim.size() - 1) - rhs_spatial_index[ki]) + : rhs_spatial_index[ki]) * + rhs_dim_multipliers[dnums.kernel_spatial_dimensions(ki)]; + } + + result_val += + static_cast(lhs_literal_data[lhs_linear_index]) * + static_cast(rhs_literal_data[rhs_linear_index]); + } + cnt : {} + } while (IndexUtil::BumpIndices(window_shape, &rhs_spatial_index)); + + return static_cast(result_val); + }; + + auto result = MakeUnique(result_shape); + TF_RETURN_IF_ERROR(result->PopulateParallel(func)); + + parent_->evaluated_[conv] = std::move(result); + return Status::OK(); + } + + Status HandleDot(HloInstruction* dot) override { + auto lhs = dot->operand(0); + auto rhs = dot->operand(1); + CHECK(ShapeUtil::IsArray(dot->shape())); + CHECK(ShapeUtil::IsArray(lhs->shape())); + CHECK(ShapeUtil::IsArray(rhs->shape())); + + const auto& dnums = dot->dot_dimension_numbers(); + + const auto lhs_rank = ShapeUtil::Rank(lhs->shape()); + const auto rhs_rank = ShapeUtil::Rank(rhs->shape()); + + CHECK(ShapeUtil::SameElementType(lhs->shape(), rhs->shape())); + CHECK(ShapeUtil::SameElementType(lhs->shape(), dot->shape())); + + // There must be 1 and only 1 Contracting dimension for lhs and rhs. + CHECK_EQ(dnums.lhs_contracting_dimensions_size(), 1); + CHECK_EQ(dnums.rhs_contracting_dimensions_size(), 1); + const int64 lhs_contracting_dimension = dnums.lhs_contracting_dimensions(0); + const int64 rhs_contracting_dimension = dnums.rhs_contracting_dimensions(0); + // Contracted dimension sizes must be the same. 
+ CHECK_EQ(lhs->shape().dimensions(lhs_contracting_dimension), + rhs->shape().dimensions(rhs_contracting_dimension)) + << "lhs contracted dimension: " + << lhs->shape().dimensions(lhs_contracting_dimension) + << " rhs contracted dimension: " + << rhs->shape().dimensions(rhs_contracting_dimension); + const int64 contracted_dimension_size = + lhs->shape().dimensions(lhs_contracting_dimension); + + const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); + const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); + + CHECK_EQ(dnums.lhs_batch_dimensions_size(), + dnums.rhs_batch_dimensions_size()); + + std::vector lhs_non_contracting_dims; + for (int64 i = 0; i < lhs_rank; i++) { + if (i != lhs_contracting_dimension) { + lhs_non_contracting_dims.push_back(i); + } + } + + std::vector rhs_non_batch_non_contracting_dims; + tensorflow::gtl::FlatSet batch_dims_set( + dnums.rhs_batch_dimensions().begin(), + dnums.rhs_batch_dimensions().end()); + for (int64 i = 0; i < rhs_rank; i++) { + if (i != rhs_contracting_dimension && batch_dims_set.count(i) == 0) { + rhs_non_batch_non_contracting_dims.push_back(i); + } + } + + const int64 batch_dim_size = dnums.lhs_batch_dimensions_size(); + const int64 lhs_non_contracting_size = lhs_non_contracting_dims.size(); + + DimensionVector lhs_index(lhs_rank); + DimensionVector rhs_index(rhs_rank); + auto result = MakeUnique(dot->shape()); + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice result_index) { + ElementwiseT result_val = static_cast(0); + + // Find the corresponding non-contracting indices for lhs and rhs. + // + // For `result_index`, its batch dimension, if exists, will be at the + // same dimension as the batch dimension of lhs and rhs. More + // specifically: + // - For lhs, the non-contracting dimensions, including the batch + // dimension have the same index as the `result_index`. + // - For rhs, the batch dimension is set seperately from other + // non-contracting dimensions, since these other non-contracting + // dimensions in rhs follow the non-contracting dimensions of lhs in + // the resulting index. + // + // As an example, for a resulting index: + // result_index [result_batch, result_x, result_y] + // the effecting lhs and rhs indices are: + // lhs [result_batch, lhs_non_contracting_dim, contracting_dim + // rhs [result_batch, contracting_dim, rhs_non_contracting_dim] + // `result_x` is only affected by the lhs_non_contracting_dim and + // likewise `result_y` only depends on rhs_non_contracting_dim. + // + // so we can look up the lhs and rhs indices by: + // + // lhs: + // batch index is the same as `result_batch`. + // non-contracting dimension is the same as + // result_index[lhs_non_contracting_dim] + // rhs: + // batch index: the same as `result_batch`. + // non-contracting dimension index: *not* the same as + // result_index[rhs_non_contractng_dim], since the + // non-contracting dimensions of lhs are included in the + // result_index first. Instead, the non_contracting_dim of rhs must + // be calculated as following: + // lhs_non_contracting_dimensions_size + + // (rhs_non_batch_non_contracting_dim - batch_dim_size) - 1 + // + // Note that (rhs_non_batch_contracting_dim - batch_dim_size) is + // the index offset to the result_index that only depends on + // the non_batch and non-contracting dimensions of rhs. -1 at the + // end translates size to index. 
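Stripped of the batch and index-mapping bookkeeping described above, the per-element computation in HandleDot is a plain sum over the single contracting dimension. A minimal rank-2 sketch in plain C++ (hypothetical helper, not the evaluator's Literal API):

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// Row-major [m, k] times row-major [k, n]: each result element folds the
// contracting dimension, mirroring the inner accumulation loop in HandleDot.
std::vector<float> NaiveDot(const std::vector<float>& lhs,
                            const std::vector<float>& rhs, int64_t m,
                            int64_t k, int64_t n) {
  std::vector<float> result(m * n, 0.0f);
  for (int64_t i = 0; i < m; ++i) {
    for (int64_t j = 0; j < n; ++j) {
      float acc = 0.0f;
      for (int64_t c = 0; c < k; ++c) {
        acc += lhs[i * k + c] * rhs[c * n + j];
      }
      result[i * n + j] = acc;
    }
  }
  return result;
}

int main() {
  // [[1, 2], [3, 4]] x [[5, 6], [7, 8]] = [[19, 22], [43, 50]]
  auto r = NaiveDot({1, 2, 3, 4}, {5, 6, 7, 8}, 2, 2, 2);
  std::cout << r[0] << " " << r[1] << " " << r[2] << " " << r[3] << "\n";
}
```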
+ for (auto i : lhs_non_contracting_dims) { + lhs_index[i] = result_index[i]; + } + for (auto i : dnums.rhs_batch_dimensions()) { + rhs_index[i] = result_index[i]; + } + for (auto i : rhs_non_batch_non_contracting_dims) { + const int64 rhs_non_batch_non_contracting_dim = + lhs_non_contracting_size + (i - batch_dim_size) - 1; + rhs_index[i] = result_index[rhs_non_batch_non_contracting_dim]; + } + + // Accumulates resulting product along the contracted dimension. + for (int64 i = 0; i < contracted_dimension_size; ++i) { + lhs_index[lhs_contracting_dimension] = i; + rhs_index[rhs_contracting_dimension] = i; + + result_val += + static_cast(lhs_literal.Get(lhs_index)) * + static_cast(rhs_literal.Get(rhs_index)); + } + + return static_cast(result_val); + })); + + parent_->evaluated_[dot] = std::move(result); + return Status::OK(); + } + + Status HandlePad(HloInstruction* pad) override { + CHECK(!ShapeUtil::IsTuple(pad->operand(0)->shape())); + // Padding value must be scalar. + CHECK(ShapeUtil::IsScalar(pad->operand(1)->shape())); + CHECK_EQ(ShapeUtil::Rank(pad->operand(0)->shape()), + pad->padding_config().dimensions_size()); + + TF_ASSIGN_OR_RETURN(auto inferred_return_shape, + ShapeInference::InferPadShape( + /*operand_shape=*/pad->operand(0)->shape(), + /*padding_value_shape=*/pad->operand(1)->shape(), + /*padding_config=*/pad->padding_config())); + CHECK(ShapeUtil::Compatible(pad->shape(), inferred_return_shape)) + << "return shape is set to: " << ShapeUtil::HumanString(pad->shape()) + << "but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + + // Create new HLO of padded shape with padding value. + ReturnT scalar = + parent_->GetEvaluatedLiteralFor(pad->operand(1)).Get({}); + auto result = MakeUnique(pad->shape()); + TF_RETURN_IF_ERROR(result->Populate( + [&scalar](tensorflow::gtl::ArraySlice multi_index) { + return scalar; + })); + + const Literal& evaluated_operand = + parent_->GetEvaluatedLiteralFor(pad->operand(0)); + + std::vector input_index(ShapeUtil::Rank(evaluated_operand.shape()), + 0); + std::vector target_index(ShapeUtil::Rank(result->shape()), 0); + + // Loop through each element of the operand, assign them to the + // corresponding index of the resulting padded literal. + const PaddingConfig& pad_config = pad->padding_config(); + + auto func = [&](tensorflow::gtl::ArraySlice input_index) { + for (auto i = 0; i < input_index.size(); ++i) { + // Interior padding occurs logically before edge padding, so in the case + // of negative edge padding elements are removed from the + // interior-padded operand. + target_index[i] = + pad_config.dimensions(i).edge_padding_low() + + input_index[i] * (pad_config.dimensions(i).interior_padding() + 1); + + // Account for negative low and high padding: skip assignment if the + // any target index is out of range. 
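HandlePad places each operand element at `edge_padding_low + input_index * (interior_padding + 1)` in the result and drops elements that negative edge padding pushes out of range. A one-dimensional sketch of that mapping (the `Pad1D` helper is illustrative, not part of the diff):

```c++
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<float> Pad1D(const std::vector<float>& operand, float pad_value,
                         int64_t edge_low, int64_t edge_high,
                         int64_t interior) {
  const int64_t in = static_cast<int64_t>(operand.size());
  const int64_t out =
      edge_low + edge_high + in + (in > 0 ? (in - 1) * interior : 0);
  // Start from a result filled with the padding value, as the handler does.
  std::vector<float> result(std::max<int64_t>(out, 0), pad_value);
  for (int64_t i = 0; i < in; ++i) {
    const int64_t target = edge_low + i * (interior + 1);
    if (target < 0 || target >= static_cast<int64_t>(result.size())) {
      continue;  // Removed by negative edge padding.
    }
    result[target] = operand[i];
  }
  return result;
}

int main() {
  // Pad {1, 2, 3} with low=1, high=0, interior=1, value 0 -> {0, 1, 0, 2, 0, 3}.
  for (float v : Pad1D({1, 2, 3}, 0, 1, 0, 1)) std::cout << v << " ";
  std::cout << "\n";
}
```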
+ if (!(target_index[i] >= 0 && + target_index[i] < pad->shape().dimensions(i))) { + return true; + } + } + result->Set(target_index, + evaluated_operand.Get(input_index)); + return true; + }; + + std::vector zero_base(evaluated_operand.shape().dimensions_size(), + 0); + std::vector step(evaluated_operand.shape().dimensions_size(), 1); + + ShapeUtil::ForEachIndex( + evaluated_operand.shape(), zero_base, + AsInt64Slice(evaluated_operand.shape().dimensions()), step, func); + + parent_->evaluated_[pad] = std::move(result); + return Status::OK(); + } + + Status HandleDynamicSlice(HloInstruction* dynamic_slice) override { + auto operand = dynamic_slice->operand(0); + auto start_indices = dynamic_slice->operand(1); + auto result_shape = dynamic_slice->shape(); + TF_ASSIGN_OR_RETURN(auto inferred_return_shape, + ShapeInference::InferDynamicSliceShape( + operand->shape(), start_indices->shape(), + dynamic_slice->dynamic_slice_sizes())); + TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) + << "return shape is set to: " << ShapeUtil::HumanString(result_shape) + << "but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + TF_RET_CHECK( + primitive_util::IsIntegralType(start_indices->shape().element_type())); + + const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); + const Literal& start_indices_literal = + parent_->GetEvaluatedLiteralFor(start_indices); + + switch (start_indices->shape().element_type()) { + case S32: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_slice], + DynamicSlice(operand_literal, start_indices_literal, + result_shape)); + } break; + case S64: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_slice], + DynamicSlice(operand_literal, start_indices_literal, + result_shape)); + } break; + case U32: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_slice], + DynamicSlice(operand_literal, start_indices_literal, + result_shape)); + } break; + case U64: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_slice], + DynamicSlice(operand_literal, start_indices_literal, + result_shape)); + } break; + default: + LOG(FATAL) << "HandleDynamicSlice: unhandled primitive type for " + "start_indices: " + << PrimitiveType_Name(start_indices->shape().element_type()); + } + + return Status::OK(); + } + + Status HandleDynamicUpdateSlice( + HloInstruction* dynamic_update_slice) override { + auto operand = dynamic_update_slice->operand(0); + auto update = dynamic_update_slice->operand(1); + auto start_indices = dynamic_update_slice->operand(2); + auto result_shape = dynamic_update_slice->shape(); + TF_ASSIGN_OR_RETURN( + auto inferred_return_shape, + ShapeInference::InferDynamicUpdateSliceShape( + operand->shape(), update->shape(), start_indices->shape())); + TF_RET_CHECK(ShapeUtil::Compatible(result_shape, inferred_return_shape)) + << "return shape is set to: " << ShapeUtil::HumanString(result_shape) + << "but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + TF_RET_CHECK( + primitive_util::IsIntegralType(start_indices->shape().element_type())); + TF_RET_CHECK(ShapeUtil::Compatible(result_shape, operand->shape())); + + const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); + const Literal& update_literal = parent_->GetEvaluatedLiteralFor(update); + const Literal& start_indices_literal = + parent_->GetEvaluatedLiteralFor(start_indices); + + switch (start_indices->shape().element_type()) { + case S32: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_update_slice], + 
DynamicUpdateSlice(operand_literal, update_literal, + start_indices_literal)); + } break; + case S64: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_update_slice], + DynamicUpdateSlice(operand_literal, update_literal, + start_indices_literal)); + } break; + case U32: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_update_slice], + DynamicUpdateSlice(operand_literal, update_literal, + start_indices_literal)); + } break; + case U64: { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[dynamic_update_slice], + DynamicUpdateSlice(operand_literal, update_literal, + start_indices_literal)); + } break; + default: + LOG(FATAL) << "HandleDynamicUpdateSlice: unhandled primitive type for " + "start_indices: " + << PrimitiveType_Name(start_indices->shape().element_type()); + } + + return Status::OK(); + } + + template + StatusOr> MapImpl(HloInstruction* map) { + auto operands = map->operands(); + HloComputation* computation = map->to_apply(); + + auto result = MakeUnique(map->shape()); + + HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice multi_index) { + std::vector> arg_literals; + arg_literals.reserve(operands.size()); + + // Construct scalar literal parameters to be passed to the map + // computation. + for (auto operand : operands) { + const Literal& arg_literal = + parent_->GetEvaluatedLiteralFor(operand); + + auto curr_val = arg_literal.Get(multi_index); + auto curr_val_literal = Literal::CreateR0(curr_val); + + arg_literals.push_back(std::move(curr_val_literal)); + } + + std::unique_ptr computed_result = + embedded_evaluator + .Evaluate>(*computation, + arg_literals) + .ConsumeValueOrDie(); + // Clear visit states so that the we can use the evaluate again on + // the same computation. 
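MapImpl evaluates the mapped computation once per output element: it wraps each operand element in a scalar literal, runs the embedded evaluator, and reads the scalar result back. The essence of that per-element loop, with an ordinary `std::function` standing in for the embedded HLO computation (hypothetical helper, not the evaluator API):

```c++
#include <functional>
#include <iostream>
#include <vector>

template <typename T>
std::vector<T> MapElements(const std::vector<T>& operand,
                           const std::function<T(T)>& computation) {
  std::vector<T> result;
  result.reserve(operand.size());
  for (const T& v : operand) {
    // The real evaluator builds a scalar literal per element, evaluates the
    // mapped computation on it, and resets visit states before reuse.
    result.push_back(computation(v));
  }
  return result;
}

int main() {
  auto doubled = MapElements<int>({1, 2, 3}, [](int x) { return 2 * x; });
  for (int v : doubled) std::cout << v << " ";  // 2 4 6
  std::cout << "\n";
}
```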
+ embedded_evaluator.ResetVisitStates(); + + return computed_result->Get({}); + })); + return std::move(result); + } + + Status HandleMap(HloInstruction* map) override { + switch (map->operand(0)->shape().element_type()) { + case PRED: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case U8: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case U32: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case U64: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case S8: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case S32: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case S64: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case F16: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], + MapImpl(map)); + break; + } + case F32: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case F64: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + case C64: { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[map], MapImpl(map)); + break; + } + default: + LOG(FATAL) << "HandleMap: unhandled primitive type for " + "input operand: " + << PrimitiveType_Name( + map->operand(0)->shape().element_type()); + } + + return Status::OK(); + } + + Status HandleReduce(HloInstruction* reduce) override { + auto arg = reduce->operand(0); + auto init_value = reduce->operand(1); + tensorflow::gtl::ArraySlice dimensions(reduce->dimensions()); + HloComputation* function = reduce->to_apply(); + TF_RET_CHECK(ShapeUtil::Rank(reduce->shape()) == + ShapeUtil::Rank(arg->shape()) - dimensions.size()); + TF_ASSIGN_OR_RETURN(auto inferred_return_shape, + ShapeInference::InferReduceShape( + /*arg=*/arg->shape(), + /*init_value=*/init_value->shape(), + /*dimensions_to_reduce=*/dimensions, + /*to_apply=*/function->ComputeProgramShape())); + TF_RET_CHECK(ShapeUtil::Compatible(reduce->shape(), inferred_return_shape)) + << "return shape is set to: " << ShapeUtil::HumanString(reduce->shape()) + << "but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + + const Literal& arg_literal = parent_->GetEvaluatedLiteralFor(arg); + VLOG(3) << "HandleReduce arg_literal: " << arg_literal.ToString(); + const Literal& init_literal = parent_->GetEvaluatedLiteralFor(init_value); + VLOG(3) << "HandleReduce init_literal: " << init_literal.ToString(); + TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); + auto init_scalar = init_literal.Get({}); + + const auto arg_dimensions = AsInt64Slice(arg_literal.shape().dimensions()); + std::vector arg_dim_steps(arg_dimensions.size()); + std::vector arg_dim_counts(arg_dimensions.size()); + for (const int64 dim : dimensions) { + arg_dim_steps[dim] = 1; + arg_dim_counts[dim] = arg_dimensions[dim]; + } + + // Map each dimension in the result to a dimension in arg that isn't + // being reduced. + std::vector result_to_arg_index; + for (int64 i = 0; i < arg_dimensions.size(); ++i) { + if (arg_dim_steps[i] == 0) { + result_to_arg_index.push_back(i); + } + } + + HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); + auto result = MakeUnique(reduce->shape()); + // For each resulting dimension, calculate and assign computed value. 
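HandleReduce keeps the non-reduced dimensions in the result and, for each result element, folds every operand element along the reduced dimensions into the init value. A rank-2, single-reduced-dimension sketch of that bookkeeping (the `Reduce2D` helper is illustrative only):

```c++
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

std::vector<float> Reduce2D(const std::vector<float>& arg,  // [rows, cols]
                            int64_t rows, int64_t cols, float init,
                            int64_t reduce_dim,  // 0 or 1
                            const std::function<float(float, float)>& fn) {
  const int64_t result_size = reduce_dim == 0 ? cols : rows;
  std::vector<float> result(result_size, init);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) {
      // The surviving index picks the result element; the reduced index is
      // folded away through the reduction function.
      const int64_t out = reduce_dim == 0 ? c : r;
      result[out] = fn(result[out], arg[r * cols + c]);
    }
  }
  return result;
}

int main() {
  // Sum the columns of [[1, 2, 3], [4, 5, 6]]: expected {5, 7, 9}.
  auto sums = Reduce2D({1, 2, 3, 4, 5, 6}, 2, 3, 0.0f, 0,
                       [](float a, float b) { return a + b; });
  for (float v : sums) std::cout << v << " ";
  std::cout << "\n";
}
```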
+ TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice multi_index) { + ReturnT result_val = init_scalar; + + std::vector base(arg_dimensions.size()); + for (int64 i = 0; i < multi_index.size(); ++i) { + base[result_to_arg_index[i]] = multi_index[i]; + } + + // When the reduction is addition of floats, accumulate in a double + // for better precision. Also, avoid creating Literals for the + // intermediate results; it's much faster. + if (ShapeUtil::ElementIsFloating(init_literal.shape()) && + IsScalarAdd(function)) { + double computed_result = 0; + auto func = [&](tensorflow::gtl::ArraySlice input_index) { + computed_result += arg_literal.Get(input_index); + return true; + }; + ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts, + arg_dim_steps, func); + return static_cast(computed_result); + } + auto func = [&](tensorflow::gtl::ArraySlice input_index) { + auto curr_val = arg_literal.Get(input_index); + + // Evaluate computation with specified literal operands. + auto curr_val_literal = Literal::CreateR0(curr_val); + auto result_val_literal = Literal::CreateR0(result_val); + + std::unique_ptr computed_result = + embedded_evaluator + .Evaluate( + *function, + {result_val_literal.get(), curr_val_literal.get()}) + .ConsumeValueOrDie(); + // Clear visit states so that we can use the evaluator again on + // the same computation. + embedded_evaluator.ResetVisitStates(); + // Assign computed result to result_val. + result_val = computed_result->Get({}); + return true; + }; + // Computes one element of the result, reducing all dimensions that + // contribute to that element. + ShapeUtil::ForEachIndex(arg_literal.shape(), base, arg_dim_counts, + arg_dim_steps, func); + return result_val; + })); + + parent_->evaluated_[reduce] = std::move(result); + return Status::OK(); + } + + bool IsScalarAdd(HloComputation* computation) { + HloInstruction* instruction = computation->root_instruction(); + if (instruction->opcode() == HloOpcode::kAdd && + computation->num_parameters() == 2) { + const HloInstruction* lhs = instruction->operand(0); + const HloInstruction* rhs = instruction->operand(1); + return lhs->opcode() == HloOpcode::kParameter && + ShapeUtil::IsScalar(lhs->shape()) && + rhs->opcode() == HloOpcode::kParameter && + ShapeUtil::IsScalar(rhs->shape()) && lhs != rhs; + } + return false; + } + + Status HandleSelectAndScatter(HloInstruction* select_and_scatter) override { + auto operand = select_and_scatter->operand(0); + auto source = select_and_scatter->operand(1); + const Window& window = select_and_scatter->window(); + + const Literal& init_literal = + parent_->GetEvaluatedLiteralFor(select_and_scatter->operand(2)); + TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); + auto init_scalar = init_literal.Get({}); + + auto result = MakeUnique(select_and_scatter->shape()); + + // Initialize result array with the init value. 
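The IsScalarAdd fast path in HandleReduce above accumulates float sums in a double instead of round-tripping each partial result through scalar literals. Besides being faster, the wider accumulator noticeably reduces rounding error, as this small standalone comparison suggests (function names are illustrative):

```c++
#include <iostream>
#include <vector>

float SumWithDoubleAccumulator(const std::vector<float>& values) {
  double acc = 0.0;  // Wider accumulator keeps the small addends.
  for (float v : values) acc += v;
  return static_cast<float>(acc);
}

float SumWithFloatAccumulator(const std::vector<float>& values) {
  float acc = 0.0f;
  for (float v : values) acc += v;
  return acc;
}

int main() {
  // One large value followed by ~1M tiny ones: float accumulation drops the
  // tiny contributions entirely, double accumulation keeps them.
  std::vector<float> values(1 << 20, 1e-4f);
  values.insert(values.begin(), 1e7f);
  std::cout << SumWithFloatAccumulator(values) << " vs "
            << SumWithDoubleAccumulator(values) << "\n";
}
```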
+ TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice output_index) { + return init_scalar; + })); + + std::vector window_dimension_sizes; + for (const auto& window_dimension : window.dimensions()) { + window_dimension_sizes.push_back(window_dimension.size()); + } + const Shape window_shape = ShapeUtil::MakeShape( + operand->shape().element_type(), window_dimension_sizes); + + HloComputation* select = select_and_scatter->select(); + HloComputation* scatter = select_and_scatter->scatter(); + + const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); + const Literal& source_literal = parent_->GetEvaluatedLiteralFor(source); + + int64 rank = ShapeUtil::Rank(operand_literal.shape()); + + HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); + DimensionVector source_index(rank, 0); + + // Used in the dual IterateThroughWindow lambdas below. Hoisted to avoid + // dynamic memory allocations. + auto curr_val_literal = Literal::CreateR0(ReturnT()); + auto selected_val_literal = Literal::CreateR0(ReturnT()); + auto source_literal_scatter = Literal::CreateR0(ReturnT()); + auto scattered_literal = Literal::CreateR0(ReturnT()); + do { + // For each element in `source`, we place a window in `operand`. For each + // window placement, we iterate inside the window twice: + // + // 1. Find the selected index by applying `select` function to all + // elements. E.g., If the `select` function is GreaterEqual, the first + // iteration through the window finds the biggest value and returns its + // index. + // + // 2. Using the selected index, scatter value from `source` to result. We + // do this by iterating through the window, and compare each index with + // the selected index. + tensorflow::gtl::optional selected_val; + tensorflow::gtl::optional> selected_index; + + IterateThroughWindow( + window_shape, window, operand_literal.shape(), source_index, + [&](const std::vector& operand_index) { + auto curr_val = operand_literal.Get(operand_index); + if (!selected_val) { + selected_val = curr_val; + selected_index = operand_index; + } + curr_val_literal->Set({}, curr_val); + selected_val_literal->Set({}, *selected_val); + std::unique_ptr computed_result = + embedded_evaluator + .Evaluate( + *select, + {selected_val_literal.get(), curr_val_literal.get()}) + .ConsumeValueOrDie(); + bool selected = !computed_result->Get({}); + if (selected) { + selected_val = curr_val; + selected_index = operand_index; + } + embedded_evaluator.ResetVisitStates(); + }); + + IterateThroughWindow( + window_shape, window, operand_literal.shape(), source_index, + [&](const std::vector& operand_index) { + if (std::equal(operand_index.begin(), operand_index.end(), + selected_index->begin())) { + auto source = source_literal.Get(source_index); + auto scattered = result->Get(operand_index); + source_literal_scatter->Set({}, source); + scattered_literal->Set({}, scattered); + std::unique_ptr computed_result = + embedded_evaluator + .Evaluate(*scatter, + {source_literal_scatter.get(), + scattered_literal.get()}) + .ConsumeValueOrDie(); + result->Set(operand_index, computed_result->Get({})); + // Clear visit states so that the we can use the evaluator again + // on the same computation. 
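The two-pass structure described in the comment above (select an index inside the window, then scatter the source value onto it) reduces to the following in one dimension, using argmax as the select function and addition as the scatter function. `SelectAndScatter1D` is a hypothetical helper, and it assumes every window placement starts in bounds.

```c++
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<float> SelectAndScatter1D(const std::vector<float>& operand,
                                      const std::vector<float>& source,
                                      int64_t window_size, int64_t stride,
                                      float init) {
  std::vector<float> result(operand.size(), init);
  for (int64_t s = 0; s < static_cast<int64_t>(source.size()); ++s) {
    // Pass 1: pick the selected operand index for this window placement.
    int64_t selected = s * stride;
    for (int64_t w = 0; w < window_size; ++w) {
      const int64_t idx = s * stride + w;
      if (idx < static_cast<int64_t>(operand.size()) &&
          operand[idx] > operand[selected]) {
        selected = idx;
      }
    }
    // Pass 2: scatter the source value onto the selected position.
    result[selected] += source[s];
  }
  return result;
}

int main() {
  // Operand {1, 3, 2, 4}, window 2, stride 2, source {10, 20}:
  // the windows select indices 1 and 3, giving {0, 10, 0, 20}.
  for (float v : SelectAndScatter1D({1, 3, 2, 4}, {10, 20}, 2, 2, 0.0f))
    std::cout << v << " ";
  std::cout << "\n";
}
```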
+ embedded_evaluator.ResetVisitStates(); + } + }); + } while (IndexUtil::BumpIndices(source->shape(), &source_index)); + + parent_->evaluated_[select_and_scatter] = std::move(result); + return Status::OK(); + } + + Status HandleReduceWindow(HloInstruction* reduce_window) override { + auto operand = reduce_window->operand(0); + const Window& window = reduce_window->window(); + HloComputation* function = reduce_window->to_apply(); + TF_ASSIGN_OR_RETURN( + auto inferred_return_shape, + ShapeInference::InferReduceWindowShape( + /*operand_shape=*/reduce_window->operand(0)->shape(), + /*init_value=*/reduce_window->operand(1)->shape(), window, + /*to_apply_shape=*/function->ComputeProgramShape())); + TF_RET_CHECK( + ShapeUtil::Compatible(reduce_window->shape(), inferred_return_shape)) + << "return shape is set to: " + << ShapeUtil::HumanStringWithLayout(reduce_window->shape()) + << "but is inferred to be: " + << ShapeUtil::HumanStringWithLayout(inferred_return_shape); + + const Literal& operand_literal = + parent_->GetEvaluatedLiteralFor(reduce_window->operand(0)); + VLOG(3) << "HandleReduceWindow arg_literal: " << operand_literal.ToString(); + const Literal& init_literal = + parent_->GetEvaluatedLiteralFor(reduce_window->operand(1)); + VLOG(3) << "HandleReduceWindow init_literal: " << init_literal.ToString(); + TF_RET_CHECK(ShapeUtil::IsScalar(init_literal.shape())); + auto init_scalar = init_literal.Get({}); + + // Creates a Shape object from window, for iteration below. + std::vector window_dimension_sizes; + for (const auto& window_dimension : window.dimensions()) { + window_dimension_sizes.push_back(window_dimension.size()); + } + const Shape window_shape = ShapeUtil::MakeShape( + operand->shape().element_type(), window_dimension_sizes); + + DimensionVector window_index(window.dimensions_size()); + DimensionVector operand_index(ShapeUtil::Rank(operand_literal.shape())); + + HloEvaluator embedded_evaluator(parent_->max_loop_iterations_); + auto result = MakeUnique(reduce_window->shape()); + // For each resulting dimension, calculate and assign computed value. + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice output_index) { + ReturnT result_val = init_scalar; + + std::fill(window_index.begin(), window_index.end(), 0); + std::fill(operand_index.begin(), operand_index.end(), 0); + + IterateThroughWindow( + window_shape, window, operand_literal.shape(), output_index, + [&](const std::vector& operand_index) { + auto curr_val = operand_literal.Get(operand_index); + + // Evaluate computation with specified literal operands. + const auto curr_val_literal = + Literal::CreateR0(curr_val); + const auto result_val_literal = + Literal::CreateR0(result_val); + std::unique_ptr computed_result = + embedded_evaluator + .Evaluate( + *function, + {result_val_literal.get(), curr_val_literal.get()}) + .ConsumeValueOrDie(); + + // Clear visit states so that the we can use the evaluate again + // on the same computation. 
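HandleReduceWindow computes, for each output element, the fold of all operand values under one window placement into the init value. Restricted to one dimension with no padding or dilation, that is ordinary pooling; the sketch below uses a hypothetical `ReduceWindow1D` helper rather than the evaluator API.

```c++
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

std::vector<float> ReduceWindow1D(
    const std::vector<float>& operand, float init, int64_t window_size,
    int64_t stride, const std::function<float(float, float)>& fn) {
  std::vector<float> result;
  for (int64_t start = 0;
       start + window_size <= static_cast<int64_t>(operand.size());
       start += stride) {
    // Fold every element under this window placement into the init value.
    float acc = init;
    for (int64_t w = 0; w < window_size; ++w) {
      acc = fn(acc, operand[start + w]);
    }
    result.push_back(acc);
  }
  return result;
}

int main() {
  // Max over windows of size 2, stride 2, of {1, 5, 2, 4}: expected {5, 4}.
  auto pooled = ReduceWindow1D({1, 5, 2, 4}, -1e30f, 2, 2,
                               [](float a, float b) { return a > b ? a : b; });
  for (float v : pooled) std::cout << v << " ";
  std::cout << "\n";
}
```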
+ embedded_evaluator.ResetVisitStates(); + + result_val = computed_result->Get({}); + }); + + return result_val; + })); + + parent_->evaluated_[reduce_window] = std::move(result); + return Status::OK(); + } + + Status HandleSlice(HloInstruction* slice) override { + auto operand = slice->operand(0); + const Shape& shape = slice->shape(); + TF_ASSIGN_OR_RETURN(auto inferred_return_shape, + ShapeInference::InferSliceShape( + operand->shape(), slice->slice_starts(), + slice->slice_limits(), slice->slice_strides())); + TF_RET_CHECK(ShapeUtil::Compatible(shape, inferred_return_shape)) + << "return shape set to: " << ShapeUtil::HumanString(shape) + << " but is inferred to be: " + << ShapeUtil::HumanString(inferred_return_shape); + + const int64 rank = ShapeUtil::Rank(operand->shape()); + const Literal& operand_literal = parent_->GetEvaluatedLiteralFor(operand); + auto func = [&](tensorflow::gtl::ArraySlice out_index) { + DimensionVector operand_index(rank); + for (int64 i = 0; i < rank; ++i) { + operand_index[i] = + slice->slice_starts(i) + out_index[i] * slice->slice_strides(i); + } + return operand_literal.Get(operand_index); + }; + + auto result = Literal::CreateFromDimensions( + shape.element_type(), AsInt64Slice(shape.dimensions())); + TF_RETURN_IF_ERROR(result->Populate(func)); + parent_->evaluated_[slice] = std::move(result); + return Status::OK(); + } + + // Enable CLZ only for int32, uint32, int64 and uint64. + template < + typename NativeT, + typename std::enable_if< + (std::is_floating_point::value || + std::is_integral::value || is_complex_t::value) && + !(std::is_same::value || + std::is_same::value || + std::is_same::value || + std::is_same::value)>::type* = nullptr> + Status HandleClz(HloInstruction* clz) { + return InvalidArgument("Unsupported type for Clz"); + } + + template ::value || + std::is_same::value>::type* = nullptr> + Status HandleClz(HloInstruction* clz) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz], + ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) { + return 31 - tensorflow::Log2Floor(elem_operand); + })); + return Status::OK(); + } + + template ::value || + std::is_same::value>::type* = nullptr> + Status HandleClz(HloInstruction* clz) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[clz], + ElementWiseUnaryOp(clz, [](ElementwiseT elem_operand) { + return 63 - tensorflow::Log2Floor64(elem_operand); + })); + return Status::OK(); + } + + Status HandleClz(HloInstruction* clz) override { + return HandleClz(clz); + } + + template ::value>::type* = nullptr> + Status HandleSin(HloInstruction* sin) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[sin], + ElementWiseUnaryOp(sin, [](ElementwiseT elem_operand) { + return std::sin(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value || + is_complex_t::value>::type* = nullptr> + Status HandleSin(HloInstruction* sin) { + return InvalidArgument("Unsupported type for Sin"); + } + + Status HandleSin(HloInstruction* sin) override { + return HandleSin(sin); + } + + template ::value>::type* = nullptr> + Status HandleCos(HloInstruction* cos) { + TF_ASSIGN_OR_RETURN(parent_->evaluated_[cos], + ElementWiseUnaryOp(cos, [](ElementwiseT elem_operand) { + return std::cos(elem_operand); + })); + return Status::OK(); + } + + template < + typename NativeT, + typename std::enable_if::value || + is_complex_t::value>::type* = nullptr> + Status HandleCos(HloInstruction* cos) { + return InvalidArgument("Unsupported type for Cos"); + } + + Status HandleCos(HloInstruction* cos) override { + 
return HandleCos(cos); + } + + template ::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + TF_ASSIGN_OR_RETURN( + parent_->evaluated_[reduce_precision], + ElementWiseUnaryOp(reduce_precision, [reduce_precision]( + ElementwiseT elem) { + uint32_t value_as_int = tensorflow::bit_cast(elem); + const uint32_t mantissa_bits = reduce_precision->mantissa_bits(); + const uint32_t exponent_bits = reduce_precision->exponent_bits(); + + // Code is based on the CPU/GPU implementation in LLVM-emitting code. + // + // Bits in float type: + // mantissa : bits [0:22] + // exponent : bits [23:30] + // sign : bits [31] + if (mantissa_bits < 23) { + const uint32_t last_mantissa_bit_mask = 1u << (23 - mantissa_bits); + + // Compute rounding bias for round-to-nearest with ties to even. + // This is equal to a base value of 0111... plus one bit if the last + // remaining mantissa bit is 1. + const uint32_t base_rounding_bias = + (last_mantissa_bit_mask >> 1) - 1; + const uint32_t x_last_mantissa_bit = + (value_as_int & last_mantissa_bit_mask) >> (23 - mantissa_bits); + const uint32_t x_rounding_bias = + x_last_mantissa_bit + base_rounding_bias; + + // Add rounding bias, and mask out truncated bits. Note that the + // case where adding the rounding bias overflows into the exponent + // bits is correct; the non-masked mantissa bits will all be zero, + // and the exponent will be incremented by one. + const uint32_t truncation_mask = ~(last_mantissa_bit_mask - 1); + value_as_int = value_as_int + x_rounding_bias; + value_as_int = value_as_int & truncation_mask; + } + if (exponent_bits < 8) { + // Masks for f32 values. + const uint32_t f32_sign_bit_mask = 1u << 31; + const uint32_t f32_exp_bits_mask = 0xffu << 23; + + // An exponent of 2^(n-1)-1 -- that is, 0111... with the zero in the + // most- significant bit -- is equal to 1.0f for all exponent sizes. + // Adding 2^(n-1)-1 to this gives us the highest non-infinite + // exponent for a bit- size of n, and subtracting 2^(n-1)-1 from + // this gives us the lowest' exponent (corresponding to 0.0f). + // + // Thus, the f32 exponent corresponding to the highest non-infinite + // exponent for a bit size of n is (2^7-1) + 2^(n-1)-1, and the f32 + // exponent corresponding to the lowest exponent for a bit size of n + // is (2^7-1) - 2^(n-1)-1. + // + // Note that we have already checked that exponents_bits >= 1. + const uint32_t f32_exponent_bias = (1 << 7) - 1; + const uint32_t reduced_exponent_bias = + (1 << (exponent_bits - 1)) - 1; + const uint32_t reduced_max_exponent = + f32_exponent_bias + reduced_exponent_bias; + const uint32_t reduced_min_exponent = + f32_exponent_bias - reduced_exponent_bias; + + // Do we overflow or underflow? + const uint32_t x_exponent = value_as_int & f32_exp_bits_mask; + const bool x_overflows = x_exponent > (reduced_max_exponent << 23); + const bool x_underflows = + x_exponent <= (reduced_min_exponent << 23); + + // Compute appropriately-signed values of zero and infinity. + const uint32_t x_signed_zero = value_as_int & f32_sign_bit_mask; + const uint32_t x_signed_inf = x_signed_zero | f32_exp_bits_mask; + + // Force to zero or infinity if overflow or underflow. (Note that + // this truncates all denormal values to zero, rather than rounding + // them.) + value_as_int = x_overflows ? x_signed_inf : value_as_int; + value_as_int = x_underflows ? 
x_signed_zero : value_as_int; + } + + float reduced_result = tensorflow::bit_cast(value_as_int); + if (std::isnan(elem)) { + reduced_result = mantissa_bits > 0 + ? elem + : std::numeric_limits::infinity(); + } + return reduced_result; + })); + return Status::OK(); + } + + template ::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + return InvalidArgument("Double not supported for reduce precision"); + } + + template < + typename NativeT, + typename std::enable_if::value || + is_complex_t::value>::type* = nullptr> + Status HandleReducePrecision(HloInstruction* reduce_precision) { + return InvalidArgument("Unsupported type for reduce precision"); + } + + Status HandleReducePrecision(HloInstruction* reduce_precision) override { + return HandleReducePrecision(reduce_precision); + } + + private: + // Creates a vector of multipliers which can be used to create a linear index + // into shape. + // + // Given the multidimensional index {i1, ..., iN} and + // M = MakeDimMultipliers(shape), the corresponding linear index LI is simply + // + // LI = i1 * M[1] + i2 * M[2] + ... + iN * M[N]. + // + // This lets you calculate LI given the multidimensional indices in any order. + static DimensionVector MakeDimMultipliers(const Shape& shape) { + DimensionVector v(ShapeUtil::Rank(shape)); + int64 scale = 1; + for (auto dim : LayoutUtil::MinorToMajor(shape)) { + v[dim] = scale; + scale *= shape.dimensions(dim); + } + return v; + } + + // For one particular placement of a window in a base shape (the placement is + // represented as `window_count_index`), iterates inside the window. + // Translates the window index into base index. If the base index is within + // bound, call `f` with the base index. + static void IterateThroughWindow( + const Shape& window_shape, const Window& window, const Shape& base_shape, + const tensorflow::gtl::ArraySlice& window_count_index, + const std::function&)>& f) { + const int64 rank = ShapeUtil::Rank(base_shape); + DimensionVector window_index(rank); + std::fill(window_index.begin(), window_index.end(), 0); + do { + std::vector base_index(rank); + bool out_of_bound = false; + for (int64 i = 0; i < rank; ++i) { + base_index[i] = window_count_index[i] * window.dimensions(i).stride() + + window_index[i] - window.dimensions(i).padding_low(); + if (base_index[i] < 0 || base_index[i] >= base_shape.dimensions(i)) { + out_of_bound = true; + break; + } + } + if (!out_of_bound) { + f(base_index); + } + } while (IndexUtil::BumpIndices(window_shape, &window_index)); + } + + template + StatusOr> DynamicSlice( + const Literal& operand_literal, const Literal& start_indices_literal, + const Shape& result_shape) { + auto start_indices_typed = start_indices_literal.data(); + std::vector start(start_indices_typed.begin(), + start_indices_typed.end()); + + // Clamp the start indices so the slice is in-bounds w.r.t the operand. + + // TODO(b/74360564): This is implementation defined behavior, but is + // currently respected by all implementations. Change this if we ever decide + // to officially document different behavior. 
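The clamp described in the TODO above limits each start index to the range [0, operand_dim - slice_dim], so the requested slice always lies fully inside the operand. A minimal sketch of that per-dimension clamp (hypothetical `ClampSliceStarts` helper):

```c++
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> ClampSliceStarts(std::vector<int64_t> start,
                                      const std::vector<int64_t>& operand_dims,
                                      const std::vector<int64_t>& slice_dims) {
  for (size_t i = 0; i < start.size(); ++i) {
    // Keep the slice in bounds: 0 <= start <= operand_dim - slice_dim.
    start[i] = std::min(std::max(int64_t{0}, start[i]),
                        operand_dims[i] - slice_dims[i]);
  }
  return start;
}

int main() {
  // Slicing 3 elements out of a size-8 dimension: a requested start of 7
  // is clamped to 5 so the slice stays inside the operand.
  auto clamped = ClampSliceStarts({7}, {8}, {3});
  std::cout << clamped[0] << "\n";  // 5
}
```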
+ for (int64 i = 0; i < start.size(); ++i) { + start[i] = std::min( + std::max(int64{0}, start[i]), + operand_literal.shape().dimensions(i) - result_shape.dimensions(i)); + } + + std::vector operand_indices(start.size()); + auto result = MakeUnique(result_shape); + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice multi_index) { + for (int64 i = 0; i < operand_indices.size(); ++i) { + CHECK_GE(multi_index[i] + start[i], 0); + operand_indices[i] = multi_index[i] + start[i]; + } + + auto result = operand_literal.Get(operand_indices); + return result; + })); + + return std::move(result); + } + + template + StatusOr> DynamicUpdateSlice( + const Literal& operand_literal, const Literal& update_literal, + const Literal& start_indices_literal) { + auto result = operand_literal.CloneToUnique(); + auto start_indices_typed = start_indices_literal.data(); + const auto rank = ShapeUtil::Rank(result->shape()); + std::vector start(start_indices_typed.begin(), + start_indices_typed.end()); + // Clamp the update start indices so the slice is in-bounds w.r.t the + // operand. + + // TODO(b/74360564): This is implementation defined behavior, but is + // currently respected by all implementations. Change this if we ever decide + // to oficially document different behavior. + for (int64 i = 0; i < rank; ++i) { + start[i] = std::min( + std::max(0, start[i]), + result->shape().dimensions(i) - update_literal.shape().dimensions(i)); + } + std::vector result_index(rank, 0); + + auto func = [&](tensorflow::gtl::ArraySlice update_index) { + std::transform(update_index.begin(), update_index.end(), start.begin(), + result_index.begin(), std::plus()); + result->Set(result_index, + update_literal.Get(update_index)); + return true; + }; + + std::vector base(update_literal.shape().dimensions_size(), 0); + std::vector step(update_literal.shape().dimensions_size(), 1); + ShapeUtil::ForEachIndex(update_literal.shape(), base, + AsInt64Slice(update_literal.shape().dimensions()), + step, func); + + return std::move(result); + } + + StatusOr> ElementWiseUnaryOp( + HloInstruction* instruction, + const std::function& unary_op) { + const Literal& operand_literal = + parent_->GetEvaluatedLiteralFor(instruction->operand(0)); + TF_ASSIGN_OR_RETURN( + auto result_literal, + (HloEvaluator::ElementWiseUnaryOpImpl( + instruction, ConvertUnaryFunction(unary_op), operand_literal))); + + return std::move(result_literal); + } + + StatusOr> ElementWiseBinaryOp( + HloInstruction* instruction, + const std::function& + binary_op) { + const auto shape = instruction->shape(); + const auto* lhs = instruction->operand(0); + const auto* rhs = instruction->operand(1); + + // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast + // is removed. 
+ if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) && + ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) { + return Unimplemented( + "Implicit broadcasting is currently unsupported in HLO evaluator " + "Shape Mismatch: %s vs %s vs %s: ", + ShapeUtil::HumanString(shape).c_str(), + ShapeUtil::HumanString(lhs->shape()).c_str(), + ShapeUtil::HumanString(rhs->shape()).c_str()); + } + + const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); + const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); + + auto result = MakeUnique(shape); + + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice multi_index) { + return ConvertBinaryFunction(binary_op)( + lhs_literal.Get(multi_index), + rhs_literal.Get(multi_index)); + })); + return std::move(result); + } + + template + StatusOr> ElementwiseTernaryOp( + HloInstruction* instruction, + const std::function& ternary_op) { + const auto shape = instruction->shape(); + const auto* lhs = instruction->operand(0); + const auto* rhs = instruction->operand(1); + const auto* ehs = instruction->operand(2); + + // TODO(b/35950897, b/27796129): add DCHECK back once implicit + // broadcast is removed. + if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) && + ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) && + ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) { + return Unimplemented( + "Implicit broadcasting is currently unsupported in HLO evaluator " + "Shape Mismatch: %s vs %s vs %s vs %s: ", + ShapeUtil::HumanString(shape).c_str(), + ShapeUtil::HumanString(lhs->shape()).c_str(), + ShapeUtil::HumanString(rhs->shape()).c_str(), + ShapeUtil::HumanString(ehs->shape()).c_str()); + } + + const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs); + const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs); + const Literal& ehs_literal = parent_->GetEvaluatedLiteralFor(ehs); + + auto result = MakeUnique(shape); + + TF_RETURN_IF_ERROR(result->Populate( + [&](tensorflow::gtl::ArraySlice multi_index) { + return ternary_op(lhs_literal.Get(multi_index), + rhs_literal.Get(multi_index), + ehs_literal.Get(multi_index)); + })); + + return std::move(result); + } + + template + static bool IsShiftOutOfBounds(NativeT rhs) { + typedef typename std::make_unsigned::type UnsignedT; + UnsignedT lhs_size_unsigned = sizeof(NativeT) * CHAR_BIT; + UnsignedT rhs_unsigned = static_cast(rhs); + return rhs_unsigned >= lhs_size_unsigned; + } + + HloEvaluator* parent_; +}; + +// These extern templates prevent users of this class from implicitly +// instantiating it. We explicitly instantiate this class in the various +// hlo_evaluator_typed_visitor*.cc files. 
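The extern-template arrangement referred to in the comment above keeps compile times down: the header declares each instantiation as extern so that including it never implicitly instantiates the large class template, and one .cc file per element type provides the single explicit instantiation definition. A compilable miniature of the pattern, collapsed into one translation unit for illustration (`Visitor` is a stand-in, not the XLA class):

```c++
#include <iostream>

// Stand-in for a large class template whose instantiations should be
// compiled once, in dedicated .cc files.
template <typename T>
struct Visitor {
  T Twice(T v) { return v + v; }
};

// In the header: suppress implicit instantiation in every includer.
extern template struct Visitor<float>;

// In the per-type .cc file: the one explicit instantiation definition.
template struct Visitor<float>;

int main() {
  Visitor<float> v;
  std::cout << v.Twice(2.5f) << "\n";  // 5
}
```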
+extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; +extern template class HloEvaluatorTypedVisitor; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_EVALUATOR_TYPED_VISITOR_H_ diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bfloat16.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bfloat16.cc new file mode 100644 index 00000000000000..39c352dfb966af --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bfloat16.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bool.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bool.cc new file mode 100644 index 00000000000000..289b40fa06d37b --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_bool.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex64.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex64.cc new file mode 100644 index 00000000000000..9cb4eb921fd3af --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_complex64.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_double.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_double.cc new file mode 100644 index 00000000000000..5e6252fbf8c24a --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_double.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_float.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_float.cc new file mode 100644 index 00000000000000..ee793ae77b1b43 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_float.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_half.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_half.cc new file mode 100644 index 00000000000000..038d9d39e4a588 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_half.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int32.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int32.cc new file mode 100644 index 00000000000000..b1952ca6193958 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int32.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int64.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int64.cc new file mode 100644 index 00000000000000..0cbaffb40b7128 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int64.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int8.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int8.cc new file mode 100644 index 00000000000000..6f4bf2a392b51a --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_int8.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint32.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint32.cc new file mode 100644 index 00000000000000..10235447e0d266 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint32.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint64.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint64.cc new file mode 100644 index 00000000000000..8abeaa6ffca440 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint64.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint8.cc b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint8.cc new file mode 100644 index 00000000000000..6dabd1c176eabc --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor_uint8.cc @@ -0,0 +1,22 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h" + +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" + +namespace xla { +template class HloEvaluatorTypedVisitor; +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc index a0cb28246d3be5..eba80c0f199f62 100644 --- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc +++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc @@ -15,53 +15,33 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/lib/strings/strcat.h" namespace xla { namespace { -class HloExecutionProfileTest : public HloTestBase { - protected: - static constexpr int64 kInstructionCyclesIndex = 0; - static constexpr int64 kInstructionNameIndex = 19; -}; +using tensorflow::strings::StrCat; +using ::testing::AllOf; +using ::testing::ContainsRegex; -// Splits `lines` into a sequence of lines delimited by newlines and then split -// each of those lines into a sequence of words delimited by spaces. Filter out -// empty words. 
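
The profile-test rewrite that continues below drops the old split-into-lines-and-words helper and instead matches the rendered report with regexes, which is far less sensitive to column layout. Here is a minimal, self-contained sketch of that matcher style using googletest/googlemock; `RenderReport` and the instruction names are invented stand-ins for `HloExecutionProfile::ToString()` output.

```c++
#include <string>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

using ::testing::AllOf;
using ::testing::ContainsRegex;

// Stand-in for the rendered execution profile text.
std::string RenderReport() {
  return "1000000 cycles (80.0%) ... %dot\n"
         "666 cycles ( 0.1%) ... %add\n";
}

TEST(ProfileReportTest, MentionsEachInstructionWithItsCycleCount) {
  // Assert only that each cycle count appears together with the corresponding
  // instruction name, instead of indexing into split-up columns.
  EXPECT_THAT(RenderReport(),
              AllOf(ContainsRegex("1000000 .*%dot"),
                    ContainsRegex("666 .*%add")));
}
```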
-std::vector> SplitIntoLinesAndWords( - tensorflow::StringPiece lines) { - std::vector> result; - for (const string& line : tensorflow::str_util::Split(lines, '\n')) { - std::vector words; - for (const string& word : tensorflow::str_util::Split(line, ' ')) { - if (!word.empty()) { - words.push_back(word); - } - } - result.push_back(std::move(words)); - } - - return result; -} +class HloExecutionProfileTest : public HloTestBase {}; TEST_F(HloExecutionProfileTest, Basic) { - std::unique_ptr hlo_module = CreateNewModule(); - - HloComputation::Builder builder(TestName()); + auto hlo_module = ParseHloString(R"( + HloModule test_module + ENTRY entry_computation { + lhs = f32[30,30]{1,0} parameter(0) + rhs = f32[30,30]{1,0} parameter(1) + add = f32[30,30]{1,0} add(lhs, rhs) + ROOT dot = f32[30,30]{1,0} dot(lhs, add), lhs_contracting_dims={1}, rhs_contracting_dims={0} + })") + .ValueOrDie(); + const HloInstruction* dot_instruction = + hlo_module->entry_computation()->root_instruction(); + const HloInstruction* add_instruction = dot_instruction->operand(1); Shape shape = ShapeUtil::MakeShape(F32, {30, 30}); - HloInstruction* param_lhs = - builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "lhs")); - HloInstruction* param_rhs = - builder.AddInstruction(HloInstruction::CreateParameter(1, shape, "rhs")); - HloInstruction* add_instruction = - builder.AddInstruction(HloInstruction::CreateBinary( - shape, HloOpcode::kAdd, param_lhs, param_rhs)); - HloInstruction* dot_instruction = - builder.AddInstruction(HloInstruction::CreateBinary( - shape, HloOpcode::kDot, param_lhs, add_instruction)); - - hlo_module->AddEntryComputation(builder.Build()); auto shape_size_function = [&](const Shape& shape) { const int64 pointer_size = 8; @@ -84,20 +64,12 @@ TEST_F(HloExecutionProfileTest, Basic) { execution_profile.SetCyclesTakenBy(add_instruction, add_cycles); execution_profile.SetCyclesTakenBy(dot_instruction, dot_cycles); - string rendered_profile = execution_profile.ToString( - backend().default_stream_executor()->GetDeviceDescription()); - std::vector> lines_and_words = - SplitIntoLinesAndWords(rendered_profile); - ASSERT_EQ(lines_and_words.size(), 8); - - const std::vector& line_2 = lines_and_words[2]; - const std::vector& line_3 = lines_and_words[3]; - - EXPECT_EQ(line_2[kInstructionCyclesIndex], std::to_string(dot_cycles)); - EXPECT_EQ(line_2[kInstructionNameIndex], '%' + dot_instruction->name()); - - EXPECT_EQ(line_3[kInstructionCyclesIndex], std::to_string(add_cycles)); - EXPECT_EQ(line_3[kInstructionNameIndex], '%' + add_instruction->name()); + EXPECT_THAT(execution_profile.ToString( + backend().default_stream_executor()->GetDeviceDescription()), + AllOf(ContainsRegex(StrCat(dot_cycles, R"(\b.*%)", + dot_instruction->name())), + ContainsRegex(StrCat(add_cycles, R"(\b.*%)", + add_instruction->name())))); } } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc index 516e14b4642ae6..61612bebd1e906 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc @@ -321,12 +321,12 @@ optional MatchTrivialComputation(const HloComputation* computation) { class HloDotDumper { public: HloDotDumper(const HloComputation* computation, tensorflow::StringPiece label, - const DebugOptions& debug_options, bool show_metadata, + const DebugOptions& debug_options, bool show_backend_config, const HloExecutionProfile* profile, NodeFilter filter) : 
computation_(computation), - label_(label.ToString()), + label_(std::string(label)), debug_options_(debug_options), - show_metadata_(show_metadata), + show_backend_config_(show_backend_config), profile_(profile), filter_(std::move(filter)) {} @@ -365,6 +365,7 @@ class HloDotDumper { string GetInstructionNodeShape(const HloInstruction* instr); string GetInstructionNodeLabel(const HloInstruction* instr); string GetInstructionNodeMetadata(const HloInstruction* instr); + string GetInstructionNodeBackendConfig(const HloInstruction* instr); string GetInstructionNodeExtraInfo(const HloInstruction* instr); string GetInstructionNodeInlinedOperands(const HloInstruction* instr); void AddInstructionIncomingEdges(const HloInstruction* instr); @@ -392,7 +393,7 @@ class HloDotDumper { const HloComputation* computation_; // never null const string label_; // overall name for the graph const DebugOptions& debug_options_; - const bool show_metadata_; + const bool show_backend_config_; const HloExecutionProfile* profile_; // may be null const NodeFilter filter_; @@ -426,7 +427,8 @@ class HloDotDumper { // When coloring by sharding information, we track the sharding string // representation to color association, by round-robin the color schemes. - std::unordered_map sharding_colors_; + std::unordered_map + sharding_colors_; int64 next_shard_color_ = 0; }; @@ -588,15 +590,26 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) { string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp, const HloInstruction* parent_instr) { VLOG(2) << "Dumping subcomputation " << subcomp->name(); - const char* computation_fmt = R"(subgraph %s { -%s -label = <%s>; -labelloc = t; -tooltip = " "; -%s -} // %s + // Add an edge from the subcomputation to its parent node. If subcomp + // belongs to a fusion node, it's drawn in place of the fusion instruction, + // so there's no need to link those. + if (parent_instr->opcode() != HloOpcode::kFusion) { + const HloInstruction* from = GetNodeForEdge(subcomp->root_instruction()); + VLOG(2) << "Edge: from " << from->name() << " to " << parent_instr->name() + << " as " << next_edge_id_; + edge_ids_.insert({{from, parent_instr}, next_edge_id_++}); + const char* edge_fmt = + R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)"; + edges_.push_back(Printf( + edge_fmt, InstructionId(from), InstructionId(parent_instr), + SubcomputationId(subcomp), subcomp->name(), parent_instr->name())); + } -)"; + // Have we already dumped this subcomputation? If so, generating the edge + // linking it and parent_instr is all we want to do in this function. + if (cluster_ids_.find(subcomp) != cluster_ids_.end()) { + return ""; + } cluster_ids_[subcomp] = next_cluster_id_++; @@ -611,6 +624,10 @@ tooltip = " "; if (!extra_info.empty()) { StrAppend(&subcomp_label, "
", extra_info); } + string node_backend_config = GetInstructionNodeBackendConfig(parent_instr); + if (!node_backend_config.empty()) { + StrAppend(&subcomp_label, "
", node_backend_config); + } bool highlight = filter_.Highlight(parent_instr); const char* fillcolor; @@ -639,25 +656,16 @@ tooltip = " "; string comp_body = DumpComputation(subcomp); - // Add an edge from the subcomputation to its parent node. If subcomp - // belongs to a fusion node, it's drawn in place of the fusion instruction, - // so there's no need to link those. - if (parent_instr->opcode() != HloOpcode::kFusion) { - const HloInstruction* from = GetNodeForEdge(subcomp->root_instruction()); - VLOG(2) << "Edge: from " << from->name() << " to " << parent_instr->name() - << " as " << next_edge_id_; - edge_ids_.insert({{from, parent_instr}, next_edge_id_++}); - const char* edge_fmt = - R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)"; - edges_.push_back(Printf( - edge_fmt, InstructionId(from), InstructionId(parent_instr), - SubcomputationId(subcomp), subcomp->name(), parent_instr->name())); - } - - string computation = - Printf(computation_fmt, id, style, subcomp_label, comp_body, id); + const char* computation_fmt = R"(subgraph %s { +%s +label = <%s>; +labelloc = t; +tooltip = " "; +%s +} // %s - return computation; +)"; + return Printf(computation_fmt, id, style, subcomp_label, comp_body, id); } string HloDotDumper::DumpComputation(const HloComputation* comp) { @@ -765,6 +773,7 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) { string node_shape = GetInstructionNodeShape(instr); string node_label = GetInstructionNodeLabel(instr); string node_metadata = GetInstructionNodeMetadata(instr); + string node_backend_config = GetInstructionNodeBackendConfig(instr); string extra_info = GetInstructionNodeExtraInfo(instr); string inlined_constants = GetInstructionNodeInlinedOperands(instr); string trivial_subcomputation = GetInstructionTrivialComputationStr(instr); @@ -782,16 +791,16 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) { } // Build the text that will be displayed inside the node. string node_body = node_label; - for (const string& s : - {trivial_subcomputation, node_metadata, extra_info, inlined_constants}) { + for (const string& s : {trivial_subcomputation, node_backend_config, + extra_info, inlined_constants}) { if (!s.empty()) { StrAppend(&node_body, "
", s); } } - return Printf(R"(%s [label=<%s>, shape=%s, tooltip=" ", %s];)" + return Printf(R"(%s [label=<%s>, shape=%s, tooltip="%s", %s];)" "\n", - InstructionId(instr), node_body, node_shape, + InstructionId(instr), node_body, node_shape, node_metadata, NodeColorAttributes(color)); } @@ -804,7 +813,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( // "{} (f32[42, 0, 10])". The alternative, calling Literal::ToString(), // enumerates all of its empty dimensions (e.g. "{ { {}, {} }, ..."), which // is just noise. - if (ShapeUtil::HasZeroElements(shape)) { + if (!ShapeUtil::IsTuple(shape) && ShapeUtil::HasZeroElements(shape)) { return Printf("{} (%s)", ShapeUtil::HumanString(constant->shape())); } @@ -816,7 +825,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands( *elem_count *= dim; } } - if (elem_count.has_value() && *elem_count <= 8) { + if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) { return Printf("%s (%s)", constant->literal().ToString(), ShapeUtil::HumanString(constant->shape())); } @@ -876,14 +885,13 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { if (!instr->has_sharding()) { return kDashedBorder; } - string shard_str = instr->sharding().ToString(); - auto it = sharding_colors_.find(shard_str); + auto it = sharding_colors_.find(instr->sharding()); if (it != sharding_colors_.end()) { return it->second; } ColorScheme color = static_cast( kBlue + (next_shard_color_++ % (kDashedBorder - kBlue))); - sharding_colors_.emplace(shard_str, color); + sharding_colors_.emplace(instr->sharding(), color); return color; } const auto kParameterColor = kOrange; @@ -916,6 +924,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kDivide: case HloOpcode::kEq: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kFloor: case HloOpcode::kGe: case HloOpcode::kGt: @@ -923,6 +932,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kIsFinite: case HloOpcode::kLe: case HloOpcode::kLog: + case HloOpcode::kLog1p: case HloOpcode::kLt: case HloOpcode::kMaximum: case HloOpcode::kMinimum: @@ -1002,6 +1012,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) { case HloOpcode::kReduceWindow: case HloOpcode::kSelectAndScatter: return kPurple; + case HloOpcode::kDomain: case HloOpcode::kFusion: case HloOpcode::kMap: return kGray; @@ -1057,10 +1068,6 @@ string HloDotDumper::GetInstructionNodeLabel(const HloInstruction* instr) { } string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { - if (!show_metadata_) { - return ""; - } - std::vector lines; if (!instr->metadata().op_name().empty()) { lines.push_back(HtmlLikeStringSanitize(instr->metadata().op_name())); @@ -1078,13 +1085,23 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) { return Join(lines, "
"); } +string HloDotDumper::GetInstructionNodeBackendConfig( + const HloInstruction* instr) { + if (!show_backend_config_ || instr->raw_backend_config_string().empty()) { + return ""; + } + + return StrCat("backend_config=\"", instr->raw_backend_config_string(), "\""); +} + string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { std::vector lines; // Get the instruction's extra attributes excluding the names of its // subcomputations, since those are drawn explicitly in the graph. for (const auto& line : instr->ExtraAttributesToString( - HloPrintOptions().set_print_subcomputation_references(false))) { + HloPrintOptions().set_print_subcomputation_mode( + HloPrintOptions::PrintSubcomputationMode::kOff))) { lines.push_back(HtmlLikeStringSanitize(line)); } @@ -1133,6 +1150,20 @@ string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) { return Join(lines, "
"); } +// Gets the total number of array elements in the given shape. For tuples, this +// is the sum of all the sizes of all of the array elements recursively in the +// tuple. +static int64 TotalElementsInShape(const Shape& shape) { + int64 elems = 0; + ShapeUtil::ForEachSubshape( + shape, [&](const Shape& subshape, const ShapeIndex& /*index*/) { + if (ShapeUtil::IsArray(subshape)) { + elems += ShapeUtil::ElementsIn(subshape); + } + }); + return elems; +} + void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { auto add_edge = [&](const HloInstruction* from, const HloInstruction* to, int64 operand_num, bool control_edge = false) { @@ -1152,9 +1183,16 @@ void HloDotDumper::AddInstructionIncomingEdges(const HloInstruction* instr) { } else if (control_edge) { edge_label = "style=\"dotted\" color=\"gray\" label=\"ctrl\""; } - const char* kEdgeFmt = R"(%s -> %s [tooltip="%s -> %s" %s];)"; + + // We print "small" arrays using a hollow arrowhead and "large" arrays using + // a filled arrowhead. For now, we use an arbitrary cutoff for what "big" + // means. + bool is_big_array = TotalElementsInShape(from->shape()) >= 4096; + + const char* kEdgeFmt = R"(%s -> %s [arrowhead=%s tooltip="%s -> %s" %s];)"; edges_.push_back(Printf(kEdgeFmt, InstructionId(from), InstructionId(to), - from->name(), to->name(), edge_label)); + (is_big_array ? "normal" : "empty"), from->name(), + to->name(), edge_label)); }; // Add edges from instr's operands to instr. Parameters within fusion @@ -1404,7 +1442,7 @@ string ExportGraph(const string& graph, string DumpGraph(const HloComputation& computation, const string& label, const DebugOptions& debug_options, const HloExecutionProfile* hlo_execution_profile, - bool show_metadata) { + bool show_backend_config) { GraphRendererInterface::GraphKind graph_kind; string graph; if (debug_options.xla_hlo_dump_as_graphdef()) { @@ -1414,9 +1452,10 @@ string DumpGraph(const HloComputation& computation, const string& label, &graph)); graph_kind = GraphRendererInterface::TF_GRAPHDEF; } else { - graph = HloDotDumper(&computation, label, debug_options, show_metadata, - hlo_execution_profile, NodeFilter()) - .Dump(); + graph = + HloDotDumper(&computation, label, debug_options, show_backend_config, + hlo_execution_profile, NodeFilter()) + .Dump(); graph_kind = GraphRendererInterface::DOT_GRAPH; } @@ -1427,13 +1466,13 @@ string DumpGraph(const HloComputation& computation, const string& label, } string DumpNeighborhoodAround(const HloInstruction& node, int radius, - bool show_metadata) { + bool show_backend_config) { auto debug_options = node.GetModule()->config().debug_options(); string label = StrCat("Neighborhood of ", radius, " nodes around ", node.name()); NodeFilter filter = MakeNodeFilter(&node, radius); string graph = - HloDotDumper(node.parent(), label, debug_options, show_metadata, + HloDotDumper(node.parent(), label, debug_options, show_backend_config, /*profile=*/nullptr, filter) .Dump(); return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options); diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h index 2704aae1e3ba7f..0b11f34abb7f0d 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h @@ -56,7 +56,7 @@ string MaybeDumpHloModule(const HloModule& module, const string& label, string DumpGraph(const HloComputation& computation, const string& label, const DebugOptions& debug_options, const HloExecutionProfile* 
hlo_execution_profile = nullptr, - bool show_metadata = false); + bool show_backend_config = false); // Like DumpGraph, but renders only nodes "near" the given node in the graph. // @@ -64,7 +64,7 @@ string DumpGraph(const HloComputation& computation, const string& label, // (roughly) corresponds to the max distance a node may be from the primary node // before it's omitted from the graph. string DumpNeighborhoodAround(const HloInstruction& node, int radius, - bool show_metadata = false); + bool show_backend_config = false); // Dumps the HloModule::ToString() as a file into the provided directory path // suffixed with the provided label. diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc index 1f00aa41dc783f..8e52d926d85f1c 100644 --- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc +++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc @@ -64,8 +64,8 @@ TEST(HloGraphDumperTest, NestedFusion) { sums.push_back(b.AddInstruction(HloInstruction::CreateBinary( shape, HloOpcode::kAdd, sums[i], params[i + 2]))); } - - HloModule m(TestName()); + HloModuleConfig config; + HloModule m(TestName(), config); m.AddEntryComputation(b.Build()); HloComputation* root_computation = m.entry_computation(); @@ -122,7 +122,8 @@ TEST(HloGraphDumperTest, Constant) { auto instruction = b.AddInstruction( HloInstruction::CreateConstant(Literal::CreateR0(-42))); instruction->set_name("i_am_a_constant_root_instruction"); - HloModule m(TestName()); + HloModuleConfig config; + HloModule m(TestName(), config); HloComputation* root_computation = m.AddEntryComputation(b.Build()); string graph = hlo_graph_dumper::DumpGraph( *root_computation, /*label=*/"an_empty_graph", DebugOptions()); @@ -130,5 +131,23 @@ TEST(HloGraphDumperTest, Constant) { EXPECT_THAT(graph, Not(HasSubstr("i_am_a_constant_root_instruction"))); } +TEST(HloGraphDumperTest, TupleConstant) { + Shape tuple_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {3, 2}), ShapeUtil::MakeShape(S32, {4, 5})}); + HloComputation::Builder b("b"); + auto constant = b.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateFromShape(tuple_shape))); + auto gte = b.AddInstruction(HloInstruction::CreateGetTupleElement( + ShapeUtil::MakeShape(F32, {3, 2}), constant, 0)); + + HloModuleConfig config; + HloModule m(TestName(), config); + HloComputation* root_computation = m.AddEntryComputation(b.Build(gte)); + string graph = hlo_graph_dumper::DumpGraph( + *root_computation, /*label=*/"tuple_constant", DebugOptions()); + EXPECT_THAT(graph, HasSubstr("tuple_constant")); + EXPECT_THAT(graph, HasSubstr("constant (f32[3,2], s32[4,5])")); +} + } // anonymous namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc index a714d0e1142450..06775d6a9ab661 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction.cc @@ -37,9 +37,11 @@ limitations under the License. 
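
The graph-dumper change above adds `TotalElementsInShape`, which sums the array elements across a (possibly tuple) shape, and uses a filled arrowhead only for edges carrying at least 4096 elements. Below is a small standalone sketch of that rule; the `Shape` alias is a stand-in for `xla::Shape`, and the cutoff mirrors the arbitrary one chosen in the diff.

```c++
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

// A tuple-like shape reduced to its essentials: one dimension list per array.
using Shape = std::vector<std::vector<int64_t>>;

// Total number of array elements across all leaves of the shape.
int64_t TotalElements(const Shape& shape) {
  int64_t total = 0;
  for (const auto& dims : shape) {
    total += std::accumulate(dims.begin(), dims.end(), int64_t{1},
                             std::multiplies<int64_t>());
  }
  return total;
}

// "Big" arrays get a filled arrowhead, small ones a hollow one.
std::string ArrowheadFor(const Shape& shape) {
  constexpr int64_t kBigArrayCutoff = 4096;
  return TotalElements(shape) >= kBigArrayCutoff ? "normal" : "empty";
}

int main() {
  std::cout << ArrowheadFor({{30, 30}}) << "\n";        // 900 elements -> empty
  std::cout << ArrowheadFor({{64, 64}, {64}}) << "\n";  // 4160 elements -> normal
  return 0;
}
```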
#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/human_readable_json.h" #include "tensorflow/core/platform/logging.h" namespace xla { @@ -51,7 +53,7 @@ using ::tensorflow::strings::StrCat; /* static */ StatusOr> HloInstruction::CreateFromProto( - HloModule* module, const HloInstructionProto& proto, + const HloInstructionProto& proto, const tensorflow::gtl::FlatMap& instruction_map, const tensorflow::gtl::FlatMap& computation_map) { TF_RET_CHECK(!proto.opcode().empty()); @@ -109,6 +111,7 @@ StatusOr> HloInstruction::CreateFromProto( instruction->name_ = proto.name(); instruction->metadata_ = proto.metadata(); + instruction->backend_config_ = proto.backend_config(); if (proto.has_literal()) { TF_ASSIGN_OR_RETURN(instruction->literal_, Literal::CreateFromProto(proto.literal())); @@ -255,11 +258,14 @@ HloInstruction::CreateGetTupleElement(const Shape& shape, case HloOpcode::kCopy: case HloOpcode::kCos: case HloOpcode::kClz: + case HloOpcode::kDomain: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kFloor: case HloOpcode::kImag: case HloOpcode::kIsFinite: case HloOpcode::kLog: + case HloOpcode::kLog1p: case HloOpcode::kNot: case HloOpcode::kNegate: case HloOpcode::kReal: @@ -417,8 +423,20 @@ HloInstruction::CreateReducePrecision(const Shape& shape, /* static */ std::unique_ptr HloInstruction::CreateCrossReplicaSum( - const Shape& shape, tensorflow::gtl::ArraySlice operands) { - return CreateNary(shape, HloOpcode::kCrossReplicaSum, operands); + const Shape& shape, tensorflow::gtl::ArraySlice operands, + HloComputation* reduce_computation, + tensorflow::gtl::ArraySlice replica_group_ids, + const tensorflow::gtl::optional& channel_id) { + // TODO(b/79737069): Remove the CHECK when supported. + CHECK(replica_group_ids.empty()); + CHECK(!channel_id.has_value()); + auto instruction = + WrapUnique(new HloInstruction(HloOpcode::kCrossReplicaSum, shape)); + for (auto operand : operands) { + instruction->AppendOperand(operand); + } + instruction->called_computations_.push_back(reduce_computation); + return instruction; } /* static */ std::unique_ptr HloInstruction::CreateInfeed( @@ -437,7 +455,7 @@ HloInstruction::CreateCrossReplicaSum( << "Outfeed shape " << shape << " must be compatible with operand shape " << operand->shape(); instruction->AppendOperand(operand); - instruction->outfeed_config_ = outfeed_config.ToString(); + instruction->outfeed_config_ = std::string(outfeed_config); instruction->outfeed_shape_ = shape; return instruction; } @@ -792,23 +810,11 @@ HloInstruction::CreateBroadcastSequence( return instruction; } -// We put the fusion kind into the instruction's name for transpose-dot fusions, -// since those fusions are really just describing a type of dot rather than -// generating a novel computation. 
-static string FusionNodeName(HloInstruction::FusionKind fusion_kind) { - switch (fusion_kind) { - case HloInstruction::FusionKind::kTransposeDot: - return "dot_fusion"; - default: - return "fusion"; - } -} - /* static */ std::unique_ptr HloInstruction::CreateFusion( const Shape& shape, FusionKind fusion_kind, HloInstruction* fused_root) { auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFusion, shape)); instruction->fusion_kind_ = fusion_kind; - instruction->name_ = FusionNodeName(fusion_kind); + instruction->name_ = "fusion"; instruction->set_parent(fused_root->parent()); instruction->set_metadata(fused_root->metadata()); instruction->CloneAndFuseInternal(fused_root); @@ -824,12 +830,21 @@ static string FusionNodeName(HloInstruction::FusionKind fusion_kind) { instruction->AppendOperand(operand); } instruction->fusion_kind_ = fusion_kind; - instruction->name_ = FusionNodeName(fusion_kind); + instruction->name_ = "fusion"; instruction->called_computations_.push_back(fusion_computation); fusion_computation->SetFusionInstruction(instruction.get()); return instruction; } +void HloInstruction::set_device_sharding(int64 device) { + HloSharding device_sharding = HloSharding::AssignDevice(device); + if (ShapeUtil::IsTuple(shape())) { + set_sharding(HloSharding::Tuple(device_sharding.GetAsShapeTree(shape()))); + } else { + set_sharding(device_sharding); + } +} + void HloInstruction::SetupDerivedInstruction( HloInstruction* derived_instruction) const { if (sharding_ != nullptr) { @@ -1123,7 +1138,7 @@ RandomDistribution HloInstruction::random_distribution() const { return distribution_; } -bool HloInstruction::HasSideEffect() const { +bool HloInstruction::HasSideEffectNoRecurse() const { switch (opcode_) { case HloOpcode::kSend: case HloOpcode::kSendDone: @@ -1135,16 +1150,22 @@ bool HloInstruction::HasSideEffect() const { case HloOpcode::kTrace: case HloOpcode::kHostCompute: return true; - default: { - // Check if any of the called computations has a side effect. - for (const auto& computation : called_computations()) { - if (computation->HasSideEffect()) { - return true; - } - } + default: return false; + } +} + +bool HloInstruction::HasSideEffect() const { + if (HasSideEffectNoRecurse()) { + return true; + } + // Check if any of the called computations has a side effect. 
+ for (const auto& computation : called_computations()) { + if (computation->HasSideEffect()) { + return true; } } + return false; } /* static */ std::unique_ptr HloInstruction::CreateCall( @@ -1167,7 +1188,7 @@ bool HloInstruction::HasSideEffect() const { for (auto operand : operands) { instruction->AppendOperand(operand); } - instruction->custom_call_target_ = custom_call_target.ToString(); + instruction->custom_call_target_ = std::string(custom_call_target); return instruction; } @@ -1179,7 +1200,7 @@ bool HloInstruction::HasSideEffect() const { for (auto operand : operands) { instruction->AppendOperand(operand); } - instruction->channel_name_ = channel_name.ToString(); + instruction->channel_name_ = std::string(channel_name); instruction->cost_estimate_ns_ = cost_estimate_ns; return instruction; } @@ -1228,10 +1249,21 @@ bool HloInstruction::HasSideEffect() const { return gather_dim_numbers; } +/* static */ std::unique_ptr HloInstruction::CreateDomain( + const Shape& shape, HloInstruction* operand, + std::unique_ptr operand_side_metadata, + std::unique_ptr user_side_metadata) { + auto instruction = WrapUnique(new HloInstruction(HloOpcode::kDomain, shape)); + instruction->operand_side_metadata_ = std::move(operand_side_metadata); + instruction->user_side_metadata_ = std::move(user_side_metadata); + instruction->AppendOperand(operand); + return instruction; +} + std::unique_ptr HloInstruction::CloneWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice new_operands, - HloModule* module) const { + HloCloneContext* context) const { VLOG(3) << "CloneWithNewOperands:\n " << ToString(); VLOG(3) << " new operands:"; for (const HloInstruction* new_operand : new_operands) { @@ -1239,7 +1271,6 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( } std::unique_ptr clone; - // Explicitly call the factory for the instruction type. This is more robust // in the face of code changes than copying fields explicitly. This also // properly sets the user fields of the operands. 
@@ -1253,10 +1284,12 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kCopy: case HloOpcode::kCos: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kImag: case HloOpcode::kIsFinite: case HloOpcode::kFloor: case HloOpcode::kLog: + case HloOpcode::kLog1p: case HloOpcode::kNot: case HloOpcode::kNegate: case HloOpcode::kReal: @@ -1309,6 +1342,14 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( break; case HloOpcode::kCustomCall: clone = CreateCustomCall(shape, new_operands, custom_call_target_); + if (window_ != nullptr) { + clone->window_ = MakeUnique(*window_); + } + if (convolution_dimension_numbers_ != nullptr) { + clone->convolution_dimension_numbers_ = + MakeUnique( + *convolution_dimension_numbers_); + } break; case HloOpcode::kHostCompute: clone = CreateHostCompute(shape, new_operands, channel_name_, @@ -1342,9 +1383,10 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( break; case HloOpcode::kFft: CHECK_EQ(new_operands.size(), 1); - return CreateFft(shape, new_operands[0], fft_type_, fft_length_); + clone = CreateFft(shape, new_operands[0], fft_type_, fft_length_); + break; case HloOpcode::kCrossReplicaSum: - clone = CreateCrossReplicaSum(shape, new_operands); + clone = CreateCrossReplicaSum(shape, new_operands, to_apply()); break; case HloOpcode::kGetTupleElement: CHECK_EQ(new_operands.size(), 1); @@ -1415,9 +1457,22 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( case HloOpcode::kConstant: clone = CreateConstant(literal_->CloneToUnique()); break; - case HloOpcode::kFusion: - clone = CloneFusionWithNewOperands(shape, new_operands, module); + case HloOpcode::kFusion: { + HloModule* module = context != nullptr ? context->module() : GetModule(); + HloComputation* new_fused_computation = nullptr; + if (context != nullptr) { + new_fused_computation = + context->FindComputation(fused_instructions_computation()); + } + if (new_fused_computation == nullptr) { + new_fused_computation = module->AddEmbeddedComputation( + fused_instructions_computation()->Clone("clone", context)); + } + clone = CreateFusion(/*shape=*/shape, /*fusion_kind=*/fusion_kind(), + /*operands=*/new_operands, + /*fusion_computation=*/new_fused_computation); break; + } case HloOpcode::kParameter: clone = CreateParameter(parameter_number_, shape, name_); break; @@ -1476,20 +1531,35 @@ std::unique_ptr HloInstruction::CloneWithNewOperands( clone = CreateGather(shape, new_operands[0], new_operands[1], *gather_dimension_numbers_, gather_window_bounds_); break; + case HloOpcode::kDomain: + CHECK_EQ(new_operands.size(), 1); + clone = + CreateDomain(shape, new_operands[0], operand_side_metadata_->Clone(), + user_side_metadata_->Clone()); + break; case HloOpcode::kTrace: LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_); } SetupDerivedInstruction(clone.get()); clone->set_parent(parent_); + clone->set_raw_backend_config_string(backend_config_); + if (context != nullptr) { + context->MapInstruction(this, clone.get()); + clone->ReplaceCalledComputations([&](HloComputation* callee) { + return callee->parent() != context->module() + ? 
context->module()->DeepCloneComputation(callee, context) + : callee; + }); + } return clone; } HloInstruction::~HloInstruction() {} -std::unique_ptr HloInstruction::Clone(const string& suffix, - HloModule* module) const { +std::unique_ptr HloInstruction::Clone( + const string& suffix, HloCloneContext* context) const { std::unique_ptr clone = - CloneWithNewOperands(shape_, operands_, module); + CloneWithNewOperands(shape_, operands_, context); if (suffix.empty()) { clone->name_ = name(); } else { @@ -1526,71 +1596,6 @@ std::unique_ptr HloInstruction::Clone(const string& suffix, return clone; } -std::unique_ptr HloInstruction::CloneFusionWithNewOperands( - const Shape& shape, tensorflow::gtl::ArraySlice operands, - HloModule* module) const { - CHECK_EQ(opcode_, HloOpcode::kFusion); - CHECK(parent() != nullptr); - - auto new_instruction = - WrapUnique(new HloInstruction(HloOpcode::kFusion, shape)); - // Add the operands to our new fusion instruction. - for (HloInstruction* new_operand : operands) { - new_instruction->AppendOperand(new_operand); - } - // Clone all the fused instructions for the new fusion instruction. - HloInstructionMap old_to_new; - std::list> new_fused_instructions; - // Create the list of fused parameters by mapping through the cloned, - // fused instructions. - for (HloInstruction* old_fused_parameter : - fused_instructions_computation()->parameter_instructions()) { - new_fused_instructions.push_back( - old_fused_parameter->Clone("clone", module)); - HloInstruction* new_fusion_parameter = new_fused_instructions.back().get(); - InsertOrDie(&old_to_new, old_fused_parameter, new_fusion_parameter); - } - for (auto old_fused_instruction : - fused_instructions_computation()->MakeInstructionPostOrder()) { - if (old_fused_instruction->opcode() == HloOpcode::kParameter) { - FindOrDie(old_to_new, old_fused_instruction); - continue; - } - std::vector new_operands; - for (int64 operand_idx = 0; - operand_idx < old_fused_instruction->operand_count(); ++operand_idx) { - HloInstruction* old_operand = - old_fused_instruction->mutable_operand(operand_idx); - new_operands.push_back(FindOrDie(old_to_new, old_operand)); - } - new_fused_instructions.push_back( - old_fused_instruction->CloneWithNewOperands( - old_fused_instruction->shape(), new_operands, module)); - HloInstruction* new_fused_instruction = new_fused_instructions.back().get(); - new_fused_instruction->set_parent(parent_); - InsertOrDie(&old_to_new, old_fused_instruction, new_fused_instruction); - } - new_instruction->fusion_kind_ = fusion_kind_; - auto computation_builder = HloComputation::Builder( - fused_instructions_computation()->name() + ".clone", - new_instruction.get()); - // We iterated the fusion instructions in reverse post order which means - // that we must reverse our new list of fusion instructions. 
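
The cloning rewrite above drops the hand-maintained `old_to_new` map of `CloneFusionWithNewOperands` in favor of an `HloCloneContext` that remembers which originals already have clones, so shared computations and instructions are cloned exactly once and later references are remapped to the existing copy. A bare-bones sketch of that idea, with a placeholder `Node` type rather than real HLO classes:

```c++
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> operands;
};

// Remembers old -> new mappings so every original node is cloned exactly once,
// even when several users point at it.
class CloneContext {
 public:
  Node* FindOrClone(Node* old_node) {
    auto it = map_.find(old_node);
    if (it != map_.end()) return it->second;  // already cloned, reuse it
    auto clone = std::make_unique<Node>();
    clone->name = old_node->name + ".clone";
    Node* raw = clone.get();
    map_[old_node] = raw;  // record the mapping before walking operands
    for (Node* operand : old_node->operands) {
      raw->operands.push_back(FindOrClone(operand));
    }
    owned_.push_back(std::move(clone));
    return raw;
  }

 private:
  std::unordered_map<Node*, Node*> map_;
  std::vector<std::unique_ptr<Node>> owned_;
};

int main() {
  Node a{"a", {}};
  Node b{"b", {&a}};
  Node c{"c", {&a, &b}};  // "a" is shared by "c" and "b"
  CloneContext context;
  Node* c_clone = context.FindOrClone(&c);
  // Both paths reach the same single clone of "a".
  return c_clone->operands[0] == c_clone->operands[1]->operands[0] ? 0 : 1;
}
```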
- for (auto new_fused_instruction_iter = new_fused_instructions.rbegin(); - new_fused_instruction_iter != new_fused_instructions.rend(); - ++new_fused_instruction_iter) { - computation_builder.AddInstruction(std::move(*new_fused_instruction_iter)); - } - if (module == nullptr) { - module = GetModule(); - } - auto fused_root_ = fused_expression_root(); - new_instruction->called_computations_.push_back( - CHECK_NOTNULL(module)->AddEmbeddedComputation( - computation_builder.Build(FindOrDie(old_to_new, fused_root_)))); - return new_instruction; -} - std::pair HloInstruction::LatestNonGteAncestorAndIndex() const { const HloInstruction* hlo = this; @@ -1619,6 +1624,8 @@ const Literal& HloInstruction::literal() const { return *literal_; } +bool HloInstruction::HasLiteral() const { return literal_ != nullptr; } + bool HloInstruction::CanHaveDimensionsField() const { return (opcode() == HloOpcode::kReverse || opcode() == HloOpcode::kConcatenate || @@ -1664,6 +1671,17 @@ int64 HloInstruction::operand_index(const HloInstruction* target) const { LOG(FATAL) << "target was not an operand: " << target->ToString(); } +HloInstruction::InstructionVector HloInstruction::unique_operands() const { + InstructionVector unique; + tensorflow::gtl::FlatSet seen; + for (HloInstruction* operand : operands()) { + if (seen.insert(operand).second) { + unique.push_back(operand); + } + } + return unique; +} + Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) { TF_RET_CHECK(instruction->parent() == parent()); if (std::find(control_successors_.begin(), control_successors_.end(), @@ -1739,26 +1757,29 @@ bool HloInstruction::HasConstantOperand() const { bool HloInstruction::IdenticalSlowPath( const HloInstruction& other, const std::function& - eq_computations, - const std::function& eq_shapes) const { + eq_computations) const { // Perform opcode specific checks. switch (opcode()) { // The result of these instructions only depend upon their opcode and // operands. case HloOpcode::kAbs: case HloOpcode::kAtan2: - case HloOpcode::kRoundNearestAfz: case HloOpcode::kAdd: + case HloOpcode::kBitcast: + case HloOpcode::kBitcastConvert: case HloOpcode::kCeil: case HloOpcode::kClamp: case HloOpcode::kClz: case HloOpcode::kComplex: + case HloOpcode::kConvert: case HloOpcode::kCopy: case HloOpcode::kCos: - case HloOpcode::kCrossReplicaSum: case HloOpcode::kDivide: + case HloOpcode::kDynamicSlice: + case HloOpcode::kDynamicUpdateSlice: case HloOpcode::kEq: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kFloor: case HloOpcode::kGe: case HloOpcode::kGt: @@ -1766,6 +1787,7 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kIsFinite: case HloOpcode::kLe: case HloOpcode::kLog: + case HloOpcode::kLog1p: case HloOpcode::kAnd: case HloOpcode::kNot: case HloOpcode::kOr: @@ -1778,6 +1800,8 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kPower: case HloOpcode::kReal: case HloOpcode::kRemainder: + case HloOpcode::kReshape: + case HloOpcode::kRoundNearestAfz: case HloOpcode::kSelect: case HloOpcode::kShiftLeft: case HloOpcode::kShiftRightArithmetic: @@ -1789,22 +1813,26 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kTuple: return true; + // Broadcast, Concatenate, and Transpose need the same dimensions field. 
+ case HloOpcode::kBroadcast: + case HloOpcode::kConcatenate: + case HloOpcode::kTranspose: + return dimensions() == other.dimensions(); + case HloOpcode::kFusion: return fusion_kind() == other.fusion_kind() && eq_computations(fused_instructions_computation(), other.fused_instructions_computation()); // These opcodes have complex or special behavior so just return false. + case HloOpcode::kDomain: case HloOpcode::kRng: case HloOpcode::kTrace: case HloOpcode::kWhile: return false; case HloOpcode::kParameter: - return parameter_number() == other.parameter_number() && - // Check the shape too because `this` and `other` may be in - // different HloComputations. - eq_shapes(shape(), other.shape()); + return parameter_number() == other.parameter_number(); case HloOpcode::kBatchNormTraining: case HloOpcode::kBatchNormInference: @@ -1816,12 +1844,6 @@ bool HloInstruction::IdenticalSlowPath( case HloOpcode::kConstant: return literal() == other.literal(); - // A convert result is determined by the primitive type that the operand is - // converted into. - case HloOpcode::kConvert: - case HloOpcode::kBitcastConvert: - return shape().element_type() == other.shape().element_type(); - // A reduce-precision operation is determined by the bit sizes. case HloOpcode::kReducePrecision: return exponent_bits() == other.exponent_bits() && @@ -1864,22 +1886,8 @@ bool HloInstruction::IdenticalSlowPath( eq_computations(scatter(), other.scatter()) && protobuf_util::ProtobufEquals(window(), other.window()); - case HloOpcode::kReshape: - return eq_shapes(shape(), other.shape()); - - // Transpose result is determined by the final shape and the permutation. - case HloOpcode::kTranspose: - return eq_shapes(shape(), other.shape()) && - dimensions() == other.dimensions(); // Remaining instructions with special values. 
- case HloOpcode::kBitcast: - return eq_shapes(shape(), other.shape()); - case HloOpcode::kBroadcast: - return eq_shapes(shape(), other.shape()) && - dimensions() == other.dimensions(); - case HloOpcode::kConcatenate: - return dimensions() == other.dimensions(); case HloOpcode::kGetTupleElement: return tuple_index() == other.tuple_index(); case HloOpcode::kPad: @@ -1889,15 +1897,24 @@ bool HloInstruction::IdenticalSlowPath( return slice_starts_ == other.slice_starts_ && slice_limits_ == other.slice_limits_ && slice_strides_ == other.slice_strides_; - case HloOpcode::kDynamicSlice: - return eq_shapes(shape(), other.shape()) && - dynamic_slice_sizes_ == other.dynamic_slice_sizes_; - case HloOpcode::kDynamicUpdateSlice: - return eq_shapes(shape(), other.shape()); case HloOpcode::kCall: + case HloOpcode::kCrossReplicaSum: case HloOpcode::kMap: return eq_computations(to_apply(), other.to_apply()); case HloOpcode::kCustomCall: + if ((window_ == nullptr) != (other.window_ == nullptr) || + (window_ != nullptr && + !protobuf_util::ProtobufEquals(window(), other.window()))) { + return false; + } + if ((convolution_dimension_numbers_ == nullptr) != + (other.convolution_dimension_numbers_ == nullptr) || + (convolution_dimension_numbers_ != nullptr && + !protobuf_util::ProtobufEquals( + convolution_dimension_numbers(), + other.convolution_dimension_numbers()))) { + return false; + } return custom_call_target_ == other.custom_call_target_; case HloOpcode::kReverse: return dimensions() == other.dimensions(); @@ -2029,6 +2046,7 @@ HloComputation* HloInstruction::to_apply() const { case HloOpcode::kMap: case HloOpcode::kReduceWindow: case HloOpcode::kReduce: + case HloOpcode::kCrossReplicaSum: CHECK_EQ(called_computations_.size(), 1); return called_computations_[0]; default: @@ -2160,28 +2178,68 @@ string PrintName(const string& name, const HloPrintOptions& options) { } // namespace string HloInstruction::ToString(const HloPrintOptions& options) const { - string result = - StrCat(PrintName(name(), options), " = ", - ShapeUtil::HumanStringWithLayout(shape()), " ", - HloOpcodeString(opcode()), "(", OperandsToString(options), ")"); + CanonicalNameMap new_map; + return ToStringWithCanonicalNameMap(options, &new_map); +} + +string HloInstruction::ToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const { + string result = ""; + + // Logic to print the instruction name (e.g. "%foo = "). + if (options.canonicalize_instruction_names()) { + if (options.is_in_nested_computation()) { + // If we are canonicalizing instruction names and this is a top-level + // HloInstruction::ToString() call, don't print an instruction name. + StrAppend(&result, + PrintName(canonical_name_map->LookupOrInsert(name()), options), + " = "); + } + } else { + StrAppend(&result, PrintName(name(), options), " = "); + } + + // Print opcode, operand(s) and shape. + StrAppend(&result, ShapeUtil::HumanStringWithLayout(shape()), " ", + HloOpcodeString(opcode()), "(", + OperandsToStringWithCanonicalNameMap(options, canonical_name_map), + ")"); + + // Print additional attributes. If an instruction contains a subcomputation, + // the subcomputation is also printed here. 
for (const string& extra : ExtraAttributesToString(options)) { StrAppend(&result, ", ", extra); } + if (options.print_metadata() && (!metadata_.op_type().empty() || !metadata_.op_name().empty() || !metadata_.source_file().empty())) { StrAppend(&result, ", metadata={", xla::OpMetadataToString(metadata_), "}"); } + if (options.print_backend_config() && !backend_config_.empty()) { + StrAppend(&result, ", backend_config=\"", CEscape(backend_config_), "\""); + } return result; } string HloInstruction::OperandsToString(const HloPrintOptions& options) const { + CanonicalNameMap new_map; + return OperandsToStringWithCanonicalNameMap(options, &new_map); +} + +string HloInstruction::OperandsToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const { string operands; if (opcode() == HloOpcode::kConstant) { // For constants, show the actual value in place of an empty operand list. - if ((!ShapeUtil::IsTuple(shape()) && - ShapeUtil::ElementsIn(shape()) <= 10) || - options.print_large_constants()) { + // + // In HloInstruction, sometimes a constant literal is not constructed due + // to its size. Skip the printing in this case. + if (HasLiteral() && ((!ShapeUtil::IsTuple(shape()) && + ShapeUtil::ElementsIn(shape()) <= 10) || + options.print_large_constants())) { // Literal::ToString emits multidimensional arrays over multiple // lines. Compact this into one line by stripping out white space. string tmp = literal().ToString(); @@ -2215,7 +2273,14 @@ string HloInstruction::OperandsToString(const HloPrintOptions& options) const { if (options.print_operand_shape()) { str.push_back(ShapeUtil::HumanStringWithLayout(operand->shape())); } - if (!options.compact_operands()) { + + // In a top-level HloInstruction::ToString() call, the operand name is not + // part of the canonical string. 
+ if (options.canonicalize_instruction_names() && + options.is_in_nested_computation()) { + str.push_back(PrintName( + canonical_name_map->LookupOrInsert(operand->name()), options)); + } else if (!options.compact_operands()) { str.push_back(PrintName(operand->name(), options)); } StrAppend(out, Join(str, " ")); @@ -2269,7 +2334,9 @@ std::vector HloInstruction::ExtraAttributesToString( } if (convolution_dimension_numbers_ != nullptr) { - extra.push_back(ConvolutionDimensionNumbersToString()); + extra.push_back(StrCat( + "dim_labels=", + ConvolutionDimensionNumbersToString(*convolution_dimension_numbers_))); } if (dot_dimension_numbers_ != nullptr) { extra.push_back(DotDimensionNumbersToString()); @@ -2284,7 +2351,8 @@ std::vector HloInstruction::ExtraAttributesToString( extra.push_back(StrCat("fft_length={", Join(fft_length(), ","), "}")); } - if (options.print_subcomputation_references()) { + if (options.print_subcomputation_mode() == + HloPrintOptions::PrintSubcomputationMode::kNameOnly) { if (opcode() == HloOpcode::kWhile) { extra.push_back( StrCat("condition=", PrintName(while_condition()->name(), options))); @@ -2301,7 +2369,8 @@ std::vector HloInstruction::ExtraAttributesToString( PrintName(false_computation()->name(), options))); } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap || opcode() == HloOpcode::kReduceWindow || - opcode() == HloOpcode::kReduce) { + opcode() == HloOpcode::kReduce || + opcode() == HloOpcode::kCrossReplicaSum) { extra.push_back( StrCat("to_apply=", PrintName(to_apply()->name(), options))); } else if (!called_computations().empty()) { @@ -2312,8 +2381,45 @@ std::vector HloInstruction::ExtraAttributesToString( PrintName(computation->name(), options)); }))); } + } else if (options.print_subcomputation_mode() == + HloPrintOptions::PrintSubcomputationMode::kFullBodies) { + HloPrintOptions new_options = options; + new_options.set_is_in_nested_computation(true); + switch (opcode()) { + case HloOpcode::kWhile: + extra.push_back( + StrCat("condition=\n", while_condition()->ToString(new_options))); + extra.push_back(StrCat("body=\n", while_body()->ToString(new_options))); + break; + case HloOpcode::kSelectAndScatter: + extra.push_back(StrCat("select=\n", select()->ToString(new_options))); + extra.push_back(StrCat("scatter=\n", scatter()->ToString(new_options))); + break; + case HloOpcode::kConditional: + extra.push_back(StrCat("true_computation=\n", + true_computation()->ToString(new_options))); + extra.push_back(StrCat("false_computation=\n", + false_computation()->ToString(new_options))); + break; + case HloOpcode::kCall: + case HloOpcode::kMap: + case HloOpcode::kReduceWindow: + case HloOpcode::kReduce: + extra.push_back( + StrCat("to_apply=\n", to_apply()->ToString(new_options))); + break; + default: + if (!called_computations().empty()) { + extra.push_back( + StrCat("calls=\n", + Join(called_computations(), ", ", + [&](string* out, const HloComputation* computation) { + StrAppend(out, computation->ToString(new_options)); + }))); + } + break; + } } - if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv || opcode() == HloOpcode::kSendDone || opcode() == HloOpcode::kRecvDone) { extra.push_back(StrCat("channel_id=", channel_id_)); @@ -2349,14 +2455,19 @@ std::vector HloInstruction::ExtraAttributesToString( extra.push_back(StrCat("exponent_bits=", exponent_bits_)); extra.push_back(StrCat("mantissa_bits=", mantissa_bits_)); } - + if (operand_side_metadata_ != nullptr && user_side_metadata_ != nullptr) { + 
extra.push_back(StrCat("domain={kind=\"", operand_side_metadata_->Kind(), + "\", entry=", operand_side_metadata_->ToString(), + ", exit=", user_side_metadata_->ToString(), "}")); + } // By contract, we print the custom call target even if - // !options.print_subcomputation_references(), because the call target is not + // options.print_subcomputation_mode() == kOff, because the call target is not // an HloComputation. if (opcode() == HloOpcode::kCustomCall) { extra.push_back( StrCat("custom_call_target=\"", CEscape(custom_call_target_), "\"")); } + return extra; } @@ -2386,6 +2497,7 @@ HloInstructionProto HloInstruction::ToProto() const { } *proto.mutable_metadata() = metadata_; + proto.set_backend_config(backend_config_); if (literal_ != nullptr) { *proto.mutable_literal() = literal_->ToProto(); } @@ -2451,6 +2563,10 @@ HloInstructionProto HloInstruction::ToProto() const { proto.add_fft_length(fft_len); } + if (has_sharding()) { + *proto.mutable_sharding() = sharding().ToProto(); + } + proto.set_channel_name(channel_name_); proto.set_cost_estimate_ns(cost_estimate_ns_); @@ -2487,8 +2603,6 @@ string HloInstruction::ToCategory() const { return "input fusion"; case FusionKind::kOutput: return "output fusion"; - case FusionKind::kTransposeDot: - return "dot"; case FusionKind::kCustom: return "custom fusion"; } @@ -2522,6 +2636,7 @@ bool HloInstruction::IsFusable() const { } // Some kinds of instructions don't make sense to fuse. switch (opcode_) { + case HloOpcode::kDomain: case HloOpcode::kParameter: return false; // Side effecting instrutions cannot be fused. @@ -2534,7 +2649,9 @@ HloComputation* HloInstruction::fused_instructions_computation() const { CHECK_EQ(opcode_, HloOpcode::kFusion); CHECK(!called_computations_.empty()); auto* fused_instructions_computation = called_computations_.front(); - CHECK(fused_instructions_computation->IsFusionComputation()); + CHECK(fused_instructions_computation->IsFusionComputation()) + << "Computation " << fused_instructions_computation->name() + << " is not a fusion kind"; return fused_instructions_computation; } @@ -2673,6 +2790,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleNegate(this); case HloOpcode::kExp: return visitor->HandleExp(this); + case HloOpcode::kExpm1: + return visitor->HandleExpm1(this); case HloOpcode::kFloor: return visitor->HandleFloor(this); case HloOpcode::kCeil: @@ -2681,6 +2800,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleClz(this); case HloOpcode::kLog: return visitor->HandleLog(this); + case HloOpcode::kLog1p: + return visitor->HandleLog1p(this); case HloOpcode::kTanh: return visitor->HandleTanh(this); case HloOpcode::kCos: @@ -2745,6 +2866,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase* visitor) { return visitor->HandleSendDone(this); case HloOpcode::kGather: return visitor->HandleGather(this); + case HloOpcode::kDomain: + return visitor->HandleDomain(this); // These opcodes are not handled here. case HloOpcode::kTrace: @@ -2971,6 +3094,7 @@ Status HloInstruction::AcceptOrdered( continue; } + // TODO(b/78350259): Eliminate const laundering. 
HloInstruction* instruction = const_cast(const_instruction); @@ -3026,10 +3150,12 @@ bool HloInstruction::IsElementwise() const { case HloOpcode::kCopy: case HloOpcode::kCos: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kFloor: case HloOpcode::kImag: case HloOpcode::kIsFinite: case HloOpcode::kLog: + case HloOpcode::kLog1p: case HloOpcode::kNot: case HloOpcode::kNegate: case HloOpcode::kReal: @@ -3094,7 +3220,7 @@ bool HloInstruction::IsElementwise() const { bool HloInstruction::ImplicitlyBroadcastsOperand(int64 operand_idx) const { CHECK(IsElementwise()); - return !ShapeUtil::Equal(shape(), operand(operand_idx)->shape()); + return !ShapeUtil::SameDimensions(shape(), operand(operand_idx)->shape()); } namespace { @@ -3270,8 +3396,6 @@ string ToString(HloInstruction::FusionKind kind) { return "kInput"; case HloInstruction::FusionKind::kOutput: return "kOutput"; - case HloInstruction::FusionKind::kTransposeDot: - return "kTransposeDot"; case HloInstruction::FusionKind::kCustom: return "kCustom"; } @@ -3288,9 +3412,6 @@ StatusOr StringToFusionKind( if (kind_name == "kOutput") { return HloInstruction::FusionKind::kOutput; } - if (kind_name == "kTransposeDot") { - return HloInstruction::FusionKind::kTransposeDot; - } if (kind_name == "kCustom") { return HloInstruction::FusionKind::kCustom; } @@ -3334,42 +3455,8 @@ string RandomDistributionToString(const RandomDistribution& distribution) { return tensorflow::str_util::Lowercase(RandomDistribution_Name(distribution)); } -StatusOr StringToRandomDistribution(const string& name) { - static std::unordered_map* map = [] { - static auto* map = new std::unordered_map; - for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) { - if (RandomDistribution_IsValid(i)) { - auto value = static_cast(i); - (*map)[RandomDistributionToString(value)] = value; - } - } - return map; - }(); - auto found = map->find(tensorflow::str_util::Lowercase(name)); - if (found == map->end()) { - return InvalidArgument("Unknown distribution"); - } - return found->second; -} - -std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) { - return os << ToString(kind); -} - -string HloInstruction::ConvolutionDimensionNumbersToString() const { - string result; - if (convolution_dimension_numbers_ == nullptr) { - return result; - } - const ConvolutionDimensionNumbers& dnums = *convolution_dimension_numbers_; - // Show the given dimension labels in order of major to minor based on the - // shape's layout. - const auto append_dims = [&](const std::vector& dims, - const Shape& shape) { - CHECK_EQ(dims.size(), ShapeUtil::Rank(shape)); - StrAppend(&result, Join(dims, "")); - }; - +string ConvolutionDimensionNumbersToString( + const ConvolutionDimensionNumbers& dnums) { // lhs_dims[i] is the symbol of the logical dimension i for the lhs // operand. E.g. if batch has dimension number 2, then lhs_dims[2] == "b". std::vector lhs_dims(2 + dnums.input_spatial_dimensions().size()); @@ -3393,19 +3480,8 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const { output_dims[dnums.output_spatial_dimensions(i)] = StrCat(i); } - result += "dim_labels="; - append_dims(lhs_dims, operand(0)->shape()); - result += "_"; - append_dims(rhs_dims, operand(1)->shape()); - result += "->"; - - // A convolution can be represented as a kConvolution HLO or as a CustomCall - // that returns a tuple, the first element of which is the result of the - // convolution. - Shape this_shape = - ShapeUtil::IsTuple(shape()) ? 
shape().tuple_shapes(0) : shape(); - append_dims(output_dims, this_shape); - return result; + return StrCat(Join(lhs_dims, ""), "_", Join(rhs_dims, ""), "->", + Join(output_dims, "")); } string HloInstruction::DotDimensionNumbersToString() const { @@ -3431,6 +3507,28 @@ string HloInstruction::DotDimensionNumbersToString() const { return Join(result, ", "); } +StatusOr StringToRandomDistribution(const string& name) { + static std::unordered_map* map = [] { + static auto* map = new std::unordered_map; + for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) { + if (RandomDistribution_IsValid(i)) { + auto value = static_cast(i); + (*map)[RandomDistributionToString(value)] = value; + } + } + return map; + }(); + auto found = map->find(tensorflow::str_util::Lowercase(name)); + if (found == map->end()) { + return InvalidArgument("Unknown distribution"); + } + return found->second; +} + +std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) { + return os << ToString(kind); +} + string HloInstruction::GatherDimensionNumbersToString() const { CHECK_NE(gather_dimension_numbers_.get(), nullptr); string output_window_dims = @@ -3462,6 +3560,31 @@ bool HloInstruction::CouldBeBitcast() const { } } +Status HloInstruction::GetBackendConfigInternal( + tensorflow::protobuf::Message* proto) const { + proto->Clear(); + + // Empty string does not parse as valid JSON, but it's a valid backend config, + // corresponding to the empty proto. + if (backend_config_.empty()) { + return Status::OK(); + } + return tensorflow::HumanReadableJsonToProto(backend_config_, proto); +} + +Status HloInstruction::set_backend_config( + const tensorflow::protobuf::Message& proto) { + TF_ASSIGN_OR_RETURN(backend_config_, BackendConfigToRawString(proto)); + return Status::OK(); +} + +/* static */ StatusOr HloInstruction::BackendConfigToRawString( + const tensorflow::protobuf::Message& proto) { + string ret; + TF_RETURN_IF_ERROR(tensorflow::ProtoToHumanReadableJson(proto, &ret)); + return ret; +} + HloModule* HloInstruction::GetModule() const { if (parent_) { return parent_->parent(); diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h index a5e9aecb9e7f52..ef55c6668f2baa 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction.h +++ b/tensorflow/compiler/xla/service/hlo_instruction.h @@ -37,6 +37,8 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_clone_context.h" +#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_sharding.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" @@ -50,6 +52,7 @@ limitations under the License. #include "tensorflow/core/lib/gtl/iterator_range.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -60,51 +63,75 @@ class HloModule; // A bunch of switches that control how the hlo text should be printed. class HloPrintOptions { public: + enum class PrintSubcomputationMode { + kOff, // Do not print anything about subcomputations. + kNameOnly, // Only print the name of subcomputations. + kFullBodies, // Print the full bodies of subcomputations. 
+ }; + // Constructs the default print options: don't print large constants, don't // compact operands, no indentation. HloPrintOptions() : print_large_constants_(false), - print_subcomputation_references_(true), + print_subcomputation_mode_(PrintSubcomputationMode::kNameOnly), print_metadata_(true), + print_backend_config_(true), compact_operands_(false), print_operand_shape_(true), print_program_shape_(true), print_percent_(true), - indent_amount_(0) {} + canonicalize_instruction_names_(false), + indent_amount_(0), + is_in_nested_computation_(false) {} static HloPrintOptions ShortParsable() { return HloPrintOptions() .set_print_large_constants(true) - .set_print_subcomputation_references(true) + .set_print_subcomputation_mode(PrintSubcomputationMode::kNameOnly) .set_print_metadata(false) + .set_print_backend_config(false) .set_print_operand_shape(false) .set_print_program_shape(false) .set_print_percent(false); } + // Options to produce the canonical string representing an isomorphic + // computation graph. + static HloPrintOptions Canonical() { + return HloPrintOptions() + .set_print_subcomputation_mode(PrintSubcomputationMode::kFullBodies) + .set_print_metadata(false) + .set_compact_operands(true) + .set_print_operand_shape(true) + .set_print_program_shape(false) + .set_print_percent(false) + .set_canonicalize_instruction_names(true); + } + // If true, large constants will be printed out. HloPrintOptions& set_print_large_constants(bool value) { print_large_constants_ = value; return *this; } - // If true, the names of subcomputations (e.g. a fusion node's fused - // computation) won't be printed. This makes the resulting text not parsable. - // - // A CustomCall's call target is printed even if - // print_subcomputation_references is false, because the call target isn't an - // HloComputation. - HloPrintOptions& set_print_subcomputation_references(bool value) { - print_subcomputation_references_ = value; + HloPrintOptions& set_print_subcomputation_mode( + PrintSubcomputationMode value) { + print_subcomputation_mode_ = value; return *this; } - // If true, metatdata will be printed. + // If true, metadata will be printed. HloPrintOptions& set_print_metadata(bool value) { print_metadata_ = value; return *this; } + // If true, backend_config will be printed. + HloPrintOptions& set_print_backend_config(bool value) { + print_backend_config_ = value; + return *this; + } + // If true, operands' shapes will be printed. HloPrintOptions& set_print_operand_shape(bool value) { print_operand_shape_ = value; @@ -130,54 +157,175 @@ class HloPrintOptions { return *this; } + // If true, canonicalizes instructions' name. Instead of using "%foo.1" as + // the name of an instruction, we use "%tmp_1", "%tmp_2" etc. + HloPrintOptions& set_canonicalize_instruction_names(bool value) { + canonicalize_instruction_names_ = value; + return *this; + } + // The indent of the hlo text block. HloPrintOptions& set_indent_amount(int value) { indent_amount_ = value; return *this; } + // If true, indicates the instruction being printed is inside a nested + // computation. 
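+ // This flag is consumed together with canonicalize_instruction_names():
+ // operand names are rewritten to "tmp_<N>" only when both are set, which is
+ // how PrintSubcomputationMode::kFullBodies prints nested computation bodies.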
+ HloPrintOptions& set_is_in_nested_computation(bool value) { + is_in_nested_computation_ = value; + return *this; + } + bool print_large_constants() const { return print_large_constants_; } - bool print_subcomputation_references() const { - return print_subcomputation_references_; + PrintSubcomputationMode print_subcomputation_mode() const { + return print_subcomputation_mode_; } bool print_metadata() const { return print_metadata_; } + bool print_backend_config() const { return print_metadata_; } bool compact_operands() const { return compact_operands_; } bool print_operand_shape() const { return print_operand_shape_; } bool print_program_shape() const { return print_program_shape_; } bool print_percent() const { return print_percent_; } + bool canonicalize_instruction_names() const { + return canonicalize_instruction_names_; + } int indent_amount() const { return indent_amount_; } + int is_in_nested_computation() const { return is_in_nested_computation_; } private: bool print_large_constants_; - bool print_subcomputation_references_; + PrintSubcomputationMode print_subcomputation_mode_; bool print_metadata_; + bool print_backend_config_; bool compact_operands_; bool print_operand_shape_; bool print_program_shape_; bool print_percent_; + bool canonicalize_instruction_names_; int indent_amount_; + bool is_in_nested_computation_; +}; + +// For canonical string output, we need to have a canonical way to rename +// each instruction and its operands. Each operand is renamed as "tmp_", +// where is an index starting from 0. +class CanonicalNameMap { + public: + CanonicalNameMap() : index(0) {} + + string LookupOrInsert(const string& old_name) { + auto iter = canonical_name_map.find(old_name); + if (iter != canonical_name_map.end()) { + return iter->second; + } + + string new_name = tensorflow::strings::StrCat("tmp_", index++); + canonical_name_map[old_name] = new_name; + return new_name; + } + void Clear() { + canonical_name_map.clear(); + index = 0; + } + + private: + int64 index; + tensorflow::gtl::FlatMap canonical_name_map; }; -// HLO instructions are the IR used by the high-level compiler. +// HLO instructions are the atomic unit of the high-level compiler's IR. +// +// HloInstructions live inside of an HloComputation, which is analogous to a +// function in other programming languages. Nodes have no total order within +// their computation. Instead, they have a partial ordering determined by their +// data and control dependencies. +// +// HLO does not have basic blocks or explicit "branch" instructions. Instead, +// certain HloInstructions -- namely, kWhile, kConditional, and kCall -- encode +// control flow. For example, the kConditional HLO executes one of two possible +// computations, depending on the runtime value of a predicate. +// +// HLO is pure (mostly). It has no concept of mutable state. Instead, data +// values are produced by one HLO and flow into consumers across dependency +// edges. class HloInstruction { public: + // A fusion node computes the same value a call to its fusion computation + // would compute. However, the choice of fusion kind dictates codegen + // strategy for the backend. + // + // To generate code for a kFusion HloInstruction, most backends do something + // like the following: + // + // 1) Identify the "primary" HloInstruction of the fused computation. + // 2) Emit code that does the work of the primary node, creating its inputs + // and transforming its outputs as specified by the fused computation. 
+ // + // In step (2), the code emitted is usually similar to the code that would be + // emitted for an *unfused* version of the primary node, except that + // + // - when the primary node reads an element of one of its operands, instead + // of loading the value from memory, it *computes* the value based on the + // contents of the fused computation. + // - when the primary node outputs a value, instead of storing it to memory, + // it forwards the value to its users, which then perform additional + // computations before the value is finally stored to memory at the root of + // the fusion node. + // + // An HloInstruction's FusionKind helps us find the kFusion instruction's + // primary node, and can also affect how we generate code in step (2). + // + // - kInput: The primary node is the root of the fused instruction. + // + // - kOutput: The primary node is not the root of the fused instruction. + // This fusion kind requires that one operand buffer of the fusion + // instruction be able to alias the output buffer. This constraint is + // usually enough to let backends find the primary node unambiguously. + // + // - kLoop: The primary node is the root of the fused computation, but, + // unlike in input fusion, we prescribe a specific implementation for + // codegen. Rather than generating code that looks like the code we'd emit + // for an unfused version of the primary/root node, we emit code that + // generates one element of the root at a time. + // + // - kCustom: Custom category for backend-specific fusions that don't fit + // into the above patterns. + // + // Not all backends support all fusion kinds, and given a particular fused + // computation, it's not in general safe to change its fusion kind. Creation + // of fusion nodes is always backend-specific. + // + // For elementwise ops (e.g. kAdd), most backends would emit a + // one-element-at-a-time implementation for the unfused version, so loop + // fusion and input fusion are probably equivalent if the root node is + // elementwise. They're not necessarily equivalent e.g. for kReduce, where an + // implementation might emit something more sophisticated for an unfused or + // input-fusion reduce, but will emit the naive code that reduces one element + // at a time for loop fusion with a reduce as the root. + // + // Another way to think of loop fusion is that it's equivalent to input + // fusion, but where the root node is an implicit identity node, whose + // unfused implementation is "read one element, write one element". + // + // TODO(b/79869434): This categorization scheme is not great. For one thing, + // input and loop fusion are basically the same thing: There is no reason for + // the HLO to encode backend-specific decisions about how e.g. a reduce that's + // the root of a fusion should be lowered. In addition, this scheme as + // written doesn't work for multi-output fusion, where the primary node is + // never actually the root (which is a kTuple instruction that gathers the + // multiple outputs of the fusion). enum class FusionKind { - kLoop, // Fused into a loop. - kInput, // Op's input is fused into the op itself. - kOutput, // Op's output is fused into the op itself. - // REQUIRES: At least one operand buffer must be able - // to alias the output buffer. - kTransposeDot, // Fused into a dot with transposed operands. - kCustom, // Custom category for backend-specific fusions that - // do not match any of the more specific ones. 
+ kLoop, + kInput, + kOutput, + kCustom, }; - ~HloInstruction(); + virtual ~HloInstruction(); // Creates an instruction from the given proto. Arguments: // - // module: the module which will contain the instruction. The newly created - // instruction is *not* added to the module or any computation, however. // proto: the proto to convert from. // instruction_map: a map from instruction id to HloInstruction*. This map // must contain all operands of the newly constructed instruction. @@ -185,7 +333,7 @@ class HloInstruction { // must contain all computations which the newly constructed instruction // calls. static StatusOr> CreateFromProto( - HloModule* module, const HloInstructionProto& proto, + const HloInstructionProto& proto, const tensorflow::gtl::FlatMap& instruction_map, const tensorflow::gtl::FlatMap& computation_map); @@ -278,10 +426,26 @@ class HloInstruction { const Shape& shape, HloInstruction* operand, const int exponent_bits, const int mantissa_bits); - // Creates a cross replica sum op. + // Creates a cross replica reduction op. + // + // `reduction_computation`: the reduction function. + // + // `replica_group_ids`: maps replica ids to subgroup ids. If empty, all + // replicas belong to one group. Allreduce will be applied within subgroups. + // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means, + // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1. + // + // `channel_id`: for Allreduce nodes from different models, if they have the + // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be + // applied cross models. + // + // TODO(b/79737069): Rename this to AllReduce. static std::unique_ptr CreateCrossReplicaSum( - const Shape& shape, - tensorflow::gtl::ArraySlice operands); + const Shape& shape, tensorflow::gtl::ArraySlice operands, + HloComputation* reduce_computation, + tensorflow::gtl::ArraySlice replica_group_ids = {}, + const tensorflow::gtl::optional& channel_id = + tensorflow::gtl::nullopt); // Creates a conversion instruction, where operand is the data to convert and // shape is the target shape for the conversion. @@ -452,6 +616,13 @@ class HloInstruction { const GatherDimensionNumbers& gather_dim_numbers, tensorflow::gtl::ArraySlice window_bounds); + // Creates a kDomain instruction which delimits an HLO domain which have + // the provided user and operand side metadata. + static std::unique_ptr CreateDomain( + const Shape& shape, HloInstruction* operand, + std::unique_ptr operand_side_metadata, + std::unique_ptr user_side_metadata); + // Creates a fusion instruction. A fusion instruction contains one or more // fused instructions forming an expression with a single root // "fused_root". Additional instructions can be added to the fusion @@ -503,6 +674,10 @@ class HloInstruction { // Returns the opcode for this instruction. HloOpcode opcode() const { return opcode_; } + // Returns true if this instruction has a side effect, irrespective of whether + // any called computations may contain an instruction with side effects. + bool HasSideEffectNoRecurse() const; + // Returns true if this instruction has a side effect. An instruction has a // side effect if it uses certain opcodes or calls a computation with a side // effect. @@ -527,6 +702,10 @@ class HloInstruction { using InstructionVector = tensorflow::gtl::InlinedVector; const InstructionVector& operands() const { return operands_; } + // Returns the vector of unique operands, in the same order they are found + // within the operand vector. 
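+ // For example, an add(x, x) instruction has operands {x, x} but a single
+ // unique operand {x}.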
+ InstructionVector unique_operands() const; + // Returns the index of 'target' in the operands sequence. // Precondition: target must be an operand (or a fatal error will occur). int64 operand_index(const HloInstruction* target) const; @@ -597,10 +776,8 @@ class HloInstruction { if (opcode() != other.opcode()) { return false; } - using EqShapeFuncType = bool (*)(const Shape&, const Shape&); - EqShapeFuncType eq_shapes = - layout_sensitive ? ShapeUtil::Equal : ShapeUtil::Compatible; - if (!eq_shapes(shape(), other.shape())) { + if (!(layout_sensitive ? ShapeUtil::Equal(shape(), other.shape()) + : ShapeUtil::Compatible(shape(), other.shape()))) { return false; } if (operands().size() != other.operands().size()) { @@ -615,7 +792,11 @@ class HloInstruction { } } - return IdenticalSlowPath(other, eq_computations, eq_shapes); + if (backend_config_ != other.backend_config_) { + return false; + } + + return IdenticalSlowPath(other, eq_computations); } // Returns whether the instruction has a constant operand. @@ -643,6 +824,8 @@ class HloInstruction { // Detaches an instruction from its operands. That is, remove the instruction // from each operand's user set. This should only be called prior to // deallocating the instruction. + // + // TODO(b/78305363): Make this automatic when deleting an instruction. void DetachFromOperands(); // Performs a postorder DFS visit using this node as the root. If @@ -695,6 +878,9 @@ class HloInstruction { // Note: only constant and parameter opcodes have an associated literal. const Literal& literal() const; + // Returns whether there is literal associated with this instruction. + bool HasLiteral() const; + // Returns the parameter number associated with this instruction. // // Note: only parameter opcodes have an associated parameter number. @@ -942,20 +1128,41 @@ class HloInstruction { } // Returns the sharding unique device, if any. tensorflow::gtl::optional sharding_unique_device() const { - if (sharding_ == nullptr || !sharding_->HasUniqueDevice()) { + if (sharding_ == nullptr) { return tensorflow::gtl::optional(); } - return sharding_->UniqueDevice().ValueOrDie(); + auto device = sharding_->UniqueDevice(); + return device.ok() ? device.ValueOrDie() + : tensorflow::gtl::optional(); } // Sets the sharding of this operator. Should only be called by HloModule or // HloComputation methods. void set_sharding(const HloSharding& sharding) { sharding_ = MakeUnique(sharding); } + // Sets a sharding that assigns the current instruction to device. + void set_device_sharding(int64 device); // Remove any sharding from this operator. void clear_sharding() { sharding_ = nullptr; } // Return true if this operator has a sharding assigned. bool has_sharding() const { return sharding_ != nullptr; } + // Checks whether the instruction has compatible sharding with the other + // instruction. + bool has_compatible_sharding(const HloInstruction* other) const { + if (!has_sharding()) { + return !other->has_sharding(); + } + return other->has_sharding() ? sharding() == other->sharding() : false; + } + + // Retrieves the operand side metadata of a kDomain instruction. + const DomainMetadata& operand_side_metadata() const { + return *operand_side_metadata_; + } + // Retrieves the user side metadata of a kDomain instruction. 
+ const DomainMetadata& user_side_metadata() const { + return *user_side_metadata_; + } // When creating a new instruction which either replaces, or shifts up (kCopy // insertion case), another instruction, we need to make sure the certain @@ -1127,9 +1334,6 @@ class HloInstruction { return fft_length_; } - // Returns the dump string of the convolution dimension numbers. - string ConvolutionDimensionNumbersToString() const; - // Returns data on the dimension numbers used for a dot operation. const DotDimensionNumbers& dot_dimension_numbers() const { CHECK(dot_dimension_numbers_ != nullptr); @@ -1160,20 +1364,15 @@ class HloInstruction { // Clones the HLO instruction. The clone will have the same opcode, shape, and // operands. After creation the clone has no uses. "this" (the instruction // cloned from) is not changed. Suffix is the string to append to the name of - // the instruction to form the name of the cloned instruction. If the module - // pointer is not nullptr, it will be the module where the cloned computations - // will be added to (in order to support deep cloning). Ignores the control - // predecessors and successors of this HLO instruction. - std::unique_ptr Clone(const string& suffix = "clone", - HloModule* module = nullptr) const; - - // Clones the HLO instruction as above but with new shape and operands. If - // the module pointer is not nullptr, it will be the module where the cloned - // computations will be added to (in order to support deep cloning). Ignores - // the control predecessors and successors of this HLO instruction. + // the instruction to form the name of the cloned instruction. + // Ignores the control predecessors and successors of this HLO instruction. + std::unique_ptr Clone( + const string& suffix = "clone", HloCloneContext* context = nullptr) const; + + // Clones the HLO instruction as above but with new shape and operands. std::unique_ptr CloneWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice operands, - HloModule* module = nullptr) const; + HloCloneContext* context = nullptr) const; // Returns the computations this instruction directly calls (if any). const std::vector& called_computations() const { @@ -1245,7 +1444,7 @@ class HloInstruction { // Gets/sets the string identifier for this instruction. const string& name() const { return name_; } - void set_name(tensorflow::StringPiece name) { name_ = name.ToString(); } + void set_name(tensorflow::StringPiece name) { name_ = std::string(name); } // Use the given NameUniquer to select a unique name for the instruction based // on the instruction's existing name. @@ -1262,6 +1461,40 @@ class HloInstruction { // if no id has been assigned yet). int unique_id() const { return unique_id_; } + // Returns the backend-specific configuration for how a backend should compile + // this HLO. The meaning of the field is backend specific. Not for use before + // or during general HLO optimization, since HLO optimizations do not preserve + // this field and they cannot interpret it due to its meaning being backend + // specific. + // + // ConfigProto should be a protobuf Message type. + template + StatusOr backend_config() const { + ConfigProto proto; + TF_RETURN_IF_ERROR(GetBackendConfigInternal(&proto)); + return std::move(proto); + } + Status set_backend_config(const tensorflow::protobuf::Message& proto); + + // Getter/setter for raw JSON-encoded backend config. Prefer the + // functions above that deal in proto Messages where possible. 
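+ //
+ // Sketch of the typical proto-based usage (MyBackendConfig and its field are
+ // hypothetical, backend-defined names):
+ //   MyBackendConfig config;
+ //   config.set_unroll_factor(4);
+ //   TF_RETURN_IF_ERROR(instr->set_backend_config(config));
+ //   TF_ASSIGN_OR_RETURN(MyBackendConfig parsed,
+ //                       instr->backend_config<MyBackendConfig>());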
+ const string& raw_backend_config_string() const { return backend_config_; } + void set_raw_backend_config_string(string config_str) { + backend_config_ = std::move(config_str); + } + + // Returns a string representation of a proto in the format used by + // raw_backend_config_string. + // + // This is morally equivalent to: + // + // HloInstruction instr; + // TF_RETURN_IF_ERROR(instr.set_backend_config(proto)); + // return instr.raw_backend_config_string(); + // + static StatusOr BackendConfigToRawString( + const tensorflow::protobuf::Message& proto); + // Sets the debug metadata for this instruction. void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; } const OpMetadata& metadata() const { return metadata_; } @@ -1283,6 +1516,7 @@ class HloInstruction { // Get/Set the number of partitions per outer dimension (in order, starting // with outer-most dimension first). Currently used by the parallel cpu // backend to partition HLOs into parallel tasks. + // // TODO(b/62783254) Replace these methods with a more general way to // annotate HLOs with backend-specific information. const std::vector& outer_dimension_partitions() const { @@ -1297,21 +1531,40 @@ class HloInstruction { void RelayoutConstant(const Layout& new_layout, const ShapeIndex& shape_index = {}); + protected: + // Internal constructor for a given opcode/shape, other fields must be filled + // by factory methods. + HloInstruction(HloOpcode opcode, const Shape& shape); + private: + // Prints an instruction to a string. + // + // The canonical string representation needs to name operands and instruction + // names in a consistent way. This is implemented through the + // canonical_name_map. + string ToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const; + + // Prints an operand to a string. + string OperandsToStringWithCanonicalNameMap( + const HloPrintOptions& options, + CanonicalNameMap* canonical_name_map) const; + + // Allow HloInstruction to access the ToStringWithCanonicalNameMap() and + // OperandsToStringWithCanonicalNameMap() functions. + friend class HloComputation; + enum class UseKind { kNoUse, kReuse, kUsePermutingElements, kUse }; // Helper class for computing OperandElementUse for kFusion. class FusionReusesParamElements; // See comments on Identical(). - // eq_shapes() is used to check shapes for equality, and would normally be - // expected to be ShapeUtil::Equals or ShapeUtil::Compatible, depending on - // whether we want a layout-sensitive check or not. bool IdenticalSlowPath( const HloInstruction& other, const std::function& - eq_computations, - const std::function& eq_shapes) const; + eq_computations) const; // Creates an n-ary elementwise operation. static std::unique_ptr CreateNary( @@ -1328,10 +1581,6 @@ class HloInstruction { // Removes a user for this instruction. void RemoveUser(HloInstruction* user); - // Internal constructor for a given opcode/shape, other fields must be filled - // by factory methods. - HloInstruction(HloOpcode opcode, const Shape& shape); - // Fuses the given instruction into this fusion instruction. When add_output // is false (which is the default), instruction_to_fuse is cloned and the // clone is placed in the fusion instruction. instruction_to_fuse is @@ -1358,7 +1607,7 @@ class HloInstruction { // Clones a fusion instruction with a new shape and operands. 
std::unique_ptr CloneFusionWithNewOperands( const Shape& shape, tensorflow::gtl::ArraySlice operands, - HloModule* module = nullptr) const; + HloCloneContext* context = nullptr) const; // Returns true if this instruction can legally have the dimensions field // set. Used for checking precondition of dimensions field accessors. @@ -1367,6 +1616,10 @@ class HloInstruction { // Returns how this instruction uses elements of its `i`th operand. UseKind OperandElementUse(int64 i) const; + // Helper for implementing backend_config(). Parses backend_config_ into the + // given proto. + Status GetBackendConfigInternal(tensorflow::protobuf::Message* proto) const; + int unique_id_; // Unique to this HloInstruction within a HloModule // Opcode for this instruction. @@ -1451,6 +1704,10 @@ class HloInstruction { // The sharding, if one exists. std::unique_ptr sharding_; + // Fields used by the kDomain instruction. + std::unique_ptr operand_side_metadata_; + std::unique_ptr user_side_metadata_; + // For parameter instructions this field holds the parameter number. int64 parameter_number_ = 0; @@ -1510,6 +1767,10 @@ class HloInstruction { // The string representation of the infeed configuration. string infeed_config_; + // The backend-specific configuration for how a backend should compile this + // HLO. See the documentation on backend_config(). + string backend_config_; + // String identifier for instruction. string name_; @@ -1532,6 +1793,9 @@ StatusOr StringToFusionKind( string PaddingConfigToString(const PaddingConfig& padding); string OpMetadataToString(const OpMetadata& metadata); string RandomDistributionToString(const RandomDistribution& distribution); +string ConvolutionDimensionNumbersToString( + const ConvolutionDimensionNumbers& dnums); + StatusOr StringToRandomDistribution(const string& name); std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind); @@ -1540,13 +1804,20 @@ std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind); // an HloInstruction* or a const HloInstruction*. // To make the iteration order over the map deterministic, the comparator // should not be using the pointer values, but rather an intrinsic property of -// the hlo. +// the hlo. Exception: null pointer values compare less than non-null. // // Note that this cannot be used for HLO instructions across multiple modules // since the id of HLO instructions are only unique within each HLO module. struct HloPtrComparator { bool operator()(const HloInstruction* const& lhs, const HloInstruction* const& rhs) const { + if (rhs == nullptr) { + // Nothing compares less than nullptr. + return false; + } + if (lhs == nullptr) { + return true; + } return lhs->unique_id() < rhs->unique_id(); } }; diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc index f2980d309d01fd..313033ddadce6a 100644 --- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc +++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc @@ -24,11 +24,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/protobuf_util.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/window_util.h" namespace xla { namespace { @@ -149,8 +151,8 @@ TEST_F(HloInstructionTest, UserWithTwoOperands) { builder.AddInstruction(HloInstruction::CreateParameter(1, r0f32_, "bar")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_THAT(add->operands(), UnorderedElementsAre(foo, bar)); EXPECT_THAT(foo->users(), UnorderedElementsAre(add)); @@ -186,8 +188,8 @@ TEST_F(HloInstructionTest, MultipleUsers) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, foo->user_count()); EXPECT_EQ(1, bar->user_count()); @@ -219,8 +221,8 @@ TEST_F(HloInstructionTest, RepeatedUser) { builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "foo")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(1, foo->user_count()); @@ -254,8 +256,8 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperands) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, c0, param1)); auto addtotal = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); OpAndUserCollectingVisitor visitor; ASSERT_IS_OK(addtotal->Accept(&visitor)); @@ -303,8 +305,8 @@ TEST_F(HloInstructionTest, MultipleUsersAndOperandsWithUnaryOps) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, addleft, addright)); auto neg2 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, addtotal)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); OpAndUserCollectingVisitor visitor; ASSERT_IS_OK(neg2->Accept(&visitor)); @@ -325,7 +327,7 @@ TEST_F(HloInstructionTest, TrivialMap) { // Shape r0f32 = ShapeUtil::MakeShape(F32, {}); Shape f32a100x10 = ShapeUtil::MakeShape(F32, {100, 10}); - HloModule module(TestName()); + auto module = CreateNewModule(); // Builds an x+1.0 computation to use in a Map. 
auto embedded_builder = HloComputation::Builder("f32+1"); @@ -335,7 +337,7 @@ TEST_F(HloInstructionTest, TrivialMap) { HloInstruction::CreateConstant(Literal::CreateR0(1.0))); embedded_builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, param, value)); - auto add_f32 = module.AddEmbeddedComputation(embedded_builder.Build()); + auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build()); // Builds a parameter and feeds it to the map. HloComputation::Builder builder(TestName()); @@ -343,7 +345,7 @@ TEST_F(HloInstructionTest, TrivialMap) { HloInstruction::CreateParameter(0, f32a100x10, "")); auto map = builder.AddInstruction( HloInstruction::CreateMap(f32a100x10, {param0}, add_f32)); - module.AddEntryComputation(builder.Build()); + module->AddEntryComputation(builder.Build()); OpAndUserCollectingVisitor visitor; ASSERT_IS_OK(map->Accept(&visitor)); @@ -373,8 +375,8 @@ TEST_F(HloInstructionTest, TrivialReduce) { HloInstruction::CreateParameter(1, r0f32, "y")); embedded_builder.AddInstruction( HloInstruction::CreateBinary(r0f32, HloOpcode::kAdd, paramx, paramy)); - HloModule module(TestName()); - auto add_f32 = module.AddEmbeddedComputation(embedded_builder.Build()); + auto module = CreateNewModule(); + auto add_f32 = module->AddEmbeddedComputation(embedded_builder.Build()); // Builds a parameter and an initial value and feeds them to the reduce. HloComputation::Builder builder(TestName()); @@ -387,7 +389,7 @@ TEST_F(HloInstructionTest, TrivialReduce) { auto reduce = builder.AddInstruction( HloInstruction::CreateReduce(f32v100, param0, const0, /*dimensions_to_reduce=*/{1}, add_f32)); - module.AddEntryComputation(builder.Build()); + module->AddEntryComputation(builder.Build()); OpAndUserCollectingVisitor visitor; ASSERT_IS_OK(reduce->Accept(&visitor)); @@ -414,8 +416,8 @@ TEST_F(HloInstructionTest, ReplaceUseInBinaryOps) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo)); builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, add_foobar, add_foofoo)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); EXPECT_EQ(1, bar->user_count()); @@ -449,8 +451,8 @@ TEST_F(HloInstructionTest, ReplaceUseInVariadicOp) { builder.AddInstruction(HloInstruction::CreateTuple({foo, bar, baz, foo})); auto add_foobar = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, bar)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); EXPECT_THAT(foo->users(), UnorderedElementsAre(tuple, add_foobar)); @@ -477,8 +479,8 @@ TEST_F(HloInstructionTest, ReplaceUseInUnaryOp) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo)); auto log = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); EXPECT_THAT(foo->users(), UnorderedElementsAre(exp, log)); @@ -514,8 +516,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesWithInBinaryOps) { HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, foo, foo)); builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, add_foobar, add_foofoo)); - HloModule 
module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(2, foo->user_count()); EXPECT_EQ(1, bar->user_count()); @@ -544,8 +546,8 @@ TEST_F(HloInstructionTest, ReplaceAllUsesInMultipleOps) { auto exp = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, foo)); auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({foo, bar})); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(3, foo->user_count()); EXPECT_EQ(2, bar->user_count()); @@ -609,8 +611,8 @@ TEST_F(HloInstructionTest, PostProcessAllVisitedNodes) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kLog, foo)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd, exp, log)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); NodeCollectorAndPostProcessor visitor; ASSERT_IS_OK(add->Accept(&visitor)); @@ -627,8 +629,8 @@ TEST_F(HloInstructionTest, SingletonFusionOp) { HloInstruction::CreateConstant(Literal::CreateR0(1.1f))); auto exp = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, constant)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {exp}, HloInstruction::FusionKind::kLoop); @@ -645,8 +647,8 @@ TEST_F(HloInstructionTest, BinaryFusionOp) { HloInstruction::CreateConstant(Literal::CreateR0(42.1f))); auto add = builder.AddInstruction(HloInstruction::CreateBinary( r0f32_, HloOpcode::kAdd, constant1, constant2)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {add}, HloInstruction::FusionKind::kLoop); @@ -667,8 +669,8 @@ TEST_F(HloInstructionTest, ChainFusionOp) { auto exp3 = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, exp2)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {exp3, exp2, exp1}, HloInstruction::FusionKind::kLoop); @@ -690,8 +692,8 @@ TEST_F(HloInstructionTest, PreserveMetadataInFusionAndClone) { exp1->set_metadata(metadata); exp2->set_metadata(metadata); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {exp2, exp1}, HloInstruction::FusionKind::kLoop); @@ -746,13 +748,13 @@ TEST_F(HloInstructionTest, PreserveTupleShapeThroughClone) { TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { // Create a fusion instruction containing a single unary operation. 
const Shape scalar_shape = ShapeUtil::MakeShape(F32, {}); - HloModule module(TestName()); + auto module = CreateNewModule(); auto make_map_computation = [&]() { auto builder = HloComputation::Builder("FusionMap"); builder.AddInstruction( HloInstruction::CreateParameter(0, scalar_shape, "param")); - return module.AddEmbeddedComputation(builder.Build()); + return module->AddEmbeddedComputation(builder.Build()); }; HloComputation* computation_x = make_map_computation(); @@ -767,7 +769,7 @@ TEST_F(HloInstructionTest, FusionOpWithCalledComputations) { scalar_shape, {map_1_x}, computation_x, /*static_operands=*/{})); auto map_3_y = builder.AddInstruction(HloInstruction::CreateMap( scalar_shape, {map_2_x}, computation_y, /*static_operands=*/{})); - auto* computation = module.AddEntryComputation(builder.Build()); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {map_3_y}, HloInstruction::FusionKind::kLoop); @@ -814,8 +816,8 @@ TEST_F(HloInstructionTest, ComplexFusionOp) { auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({sub, sub, mul, c1})); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {tuple, sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop); @@ -940,8 +942,8 @@ TEST_F(HloInstructionTest, FunctionVisitor) { HloInstruction::CreateUnary(f32, HloOpcode::kExp, param)); auto add = builder.AddInstruction( HloInstruction::CreateBinary(f32, HloOpcode::kAdd, negate, exp)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); int visit_num = 0; std::unordered_map visit_order; @@ -969,8 +971,8 @@ TEST_F(HloInstructionTest, FullyElementwise) { builder.AddInstruction(HloInstruction::CreateParameter(1, r1f32, "y")); auto add = builder.AddInstruction( HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, x, y)); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_TRUE(add->IsElementwise()); for (int i = 0; i < add->operand_count(); ++i) { @@ -1013,8 +1015,8 @@ TEST_F(HloInstructionTest, PartiallyElementwise) { HloInstruction* max = builder.AddInstruction( HloInstruction::CreateBinary(r2f32, HloOpcode::kMaximum, div, broadcast)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {max, broadcast, div, mul}, HloInstruction::FusionKind::kLoop); EXPECT_FALSE(fusion->IsElementwise()); @@ -1056,8 +1058,8 @@ TEST_F(HloInstructionTest, PartiallyElementwiseWithReuse) { HloInstruction* sub = builder.AddInstruction(HloInstruction::CreateBinary( r1f32, HloOpcode::kSubtract, min, broadcast)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( {sub, broadcast, min}, HloInstruction::FusionKind::kLoop); EXPECT_FALSE(fusion->IsElementwise()); @@ -1099,10 +1101,10 @@ 
TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) { HloInstruction* dot = builder.AddInstruction( HloInstruction::CreateDot(sout, x, reshape, dot_dnums)); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* fusion = computation->CreateFusionInstruction( - {dot, reshape}, HloInstruction::FusionKind::kTransposeDot); + {dot, reshape}, HloInstruction::FusionKind::kLoop); auto fusion2 = fusion->Clone(); const HloInstruction* root = fusion->fused_expression_root(); @@ -1118,7 +1120,7 @@ TEST_F(HloInstructionTest, CloneOfFusionPreservesShape) { } TEST_F(HloInstructionTest, FusionEquality) { - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation::Builder builder(TestName()); // Create two fusion instructions containing a single unary operation. @@ -1128,7 +1130,7 @@ TEST_F(HloInstructionTest, FusionEquality) { HloInstruction::CreateUnary(r0f32_, HloOpcode::kExp, parameter)); auto neg = builder.AddInstruction( HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, parameter)); - auto* computation = module.AddEntryComputation(builder.Build()); + auto* computation = module->AddEntryComputation(builder.Build()); auto* fusion = computation->CreateFusionInstruction( {exp}, HloInstruction::FusionKind::kLoop); auto* fusion2 = computation->CreateFusionInstruction( @@ -1140,7 +1142,7 @@ TEST_F(HloInstructionTest, FusionEquality) { } TEST_F(HloInstructionTest, NestedFusionEquality) { - HloModule module(TestName()); + auto module = CreateNewModule(); HloComputation::Builder builder(TestName()); // Build a nested fusion computation. @@ -1166,10 +1168,10 @@ TEST_F(HloInstructionTest, NestedFusionEquality) { data_shape, HloOpcode::kSubtract, dot, add_operand)); builder.AddInstruction( HloInstruction::CreateBinary(data_shape, HloOpcode::kMultiply, add, sub)); - auto computation = module.AddEntryComputation(builder.Build()); + auto computation = module->AddEntryComputation(builder.Build()); auto nested_fusion = computation->CreateFusionInstruction( - {dot, b_t}, HloInstruction::FusionKind::kTransposeDot); + {dot, b_t}, HloInstruction::FusionKind::kLoop); auto fusion = computation->CreateFusionInstruction( {add, nested_fusion}, HloInstruction::FusionKind::kOutput); @@ -1244,15 +1246,8 @@ TEST_F(HloInstructionTest, Stringification) { "%dot = f32[5,20]{1,0} dot(f32[5,10]{1,0} %x, f32[10,20]{1,0} " "%transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0}"); - HloModule module(TestName()); - auto* computation = module.AddEntryComputation(builder.Build()); - HloInstruction* fusion = computation->CreateFusionInstruction( - {dot, reshape}, HloInstruction::FusionKind::kTransposeDot); - - EXPECT_EQ( - fusion->ToString(options), - "%dot_fusion = f32[5,20]{1,0} fusion(f32[5,10]{1,0} %x, " - "f32[20,10]{1,0} %y), kind=kTransposeDot, calls=%fused_computation"); + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); HloInstruction* loop = builder.AddInstruction( HloInstruction::CreateWhile(sout, computation, computation, x)); @@ -1295,8 +1290,8 @@ TEST_F(HloInstructionTest, StringifyGather_0) { /*index_vector_dim=*/4), /*window_bounds=*/{30, 29, 28, 27, 26})); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(gather_instruction->ToString(), "%gather = 
f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} " @@ -1331,8 +1326,8 @@ TEST_F(HloInstructionTest, StringifyGather_1) { /*index_vector_dim=*/2), /*window_bounds=*/{30, 29, 28, 27, 26})); - HloModule module(TestName()); - module.AddEntryComputation(builder.Build()); + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); EXPECT_EQ(gather_instruction->ToString(), "%gather = f32[10,9,7,6,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} " @@ -1343,5 +1338,275 @@ TEST_F(HloInstructionTest, StringifyGather_1) { "index_vector_dim=2, window_bounds={30,29,28,27,26}"); } +TEST_F(HloInstructionTest, CanonnicalStringificationFusion) { + // Tests stringification of a simple op, fusion, while, and conditional. + const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); + const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); + const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20}); + const Shape sout = ShapeUtil::MakeShape(F32, {5, 20}); + + HloComputation::Builder builder("TransposeDot"); + HloInstruction* x = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x")); + HloInstruction* y = + builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y")); + HloInstruction* reshape = + builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0})); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + HloInstruction* dot = builder.AddInstruction( + HloInstruction::CreateDot(sout, x, reshape, dot_dnums)); + + auto options = HloPrintOptions().Canonical(); + + EXPECT_EQ(dot->ToString(options), + "f32[5,20]{1,0} dot(f32[5,10]{1,0}, f32[10,20]{1,0}), " + "lhs_contracting_dims={1}, rhs_contracting_dims={0}"); + + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + HloInstruction* fusion = computation->CreateFusionInstruction( + {dot, reshape}, HloInstruction::FusionKind::kLoop); + + EXPECT_EQ( + fusion->ToString(options), + R"(f32[5,20]{1,0} fusion(f32[5,10]{1,0}, f32[20,10]{1,0}), kind=kLoop, calls= +{ + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} + ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} +})"); +} + +TEST_F(HloInstructionTest, CanonnicalStringificationWhile) { + // Tests stringification of a simple op, fusion, while, and conditional. 
+ const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); + const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); + const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20}); + const Shape sout = ShapeUtil::MakeShape(F32, {5, 20}); + + HloComputation::Builder builder("TransposeDot"); + HloInstruction* x = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x")); + HloInstruction* y = + builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y")); + HloInstruction* reshape = + builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0})); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + HloInstruction* dot = builder.AddInstruction( + HloInstruction::CreateDot(sout, x, reshape, dot_dnums)); + + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + computation->CreateFusionInstruction({dot, reshape}, + HloInstruction::FusionKind::kLoop); + + HloInstruction* loop = builder.AddInstruction( + HloInstruction::CreateWhile(sout, computation, computation, x)); + + auto options = HloPrintOptions().Canonical(); + EXPECT_EQ(loop->ToString(options), + R"(f32[5,20]{1,0} while(f32[5,10]{1,0}), condition= +{ + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls= + { + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} + ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} + } +}, body= +{ + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls= + { + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} + ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} + } +})"); +} + +TEST_F(HloInstructionTest, CanonnicalStringificationConditional) { + // Tests stringification of a simple op, fusion, while, and conditional. 
+ const Shape s1 = ShapeUtil::MakeShape(F32, {5, 10}); + const Shape s2 = ShapeUtil::MakeShape(F32, {20, 10}); + const Shape s2t = ShapeUtil::MakeShape(F32, {10, 20}); + const Shape sout = ShapeUtil::MakeShape(F32, {5, 20}); + + HloComputation::Builder builder("TransposeDot"); + HloInstruction* x = + builder.AddInstruction(HloInstruction::CreateParameter(0, s1, "x")); + HloInstruction* y = + builder.AddInstruction(HloInstruction::CreateParameter(1, s2, "y")); + HloInstruction* reshape = + builder.AddInstruction(HloInstruction::CreateTranspose(s2t, y, {1, 0})); + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + HloInstruction* dot = builder.AddInstruction( + HloInstruction::CreateDot(sout, x, reshape, dot_dnums)); + + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + computation->CreateFusionInstruction({dot, reshape}, + HloInstruction::FusionKind::kLoop); + + builder.AddInstruction( + HloInstruction::CreateWhile(sout, computation, computation, x)); + + auto pred = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(true))); + HloInstruction* conditional = + builder.AddInstruction(HloInstruction::CreateConditional( + sout, pred, x, computation, x, computation)); + auto options = HloPrintOptions().Canonical(); + EXPECT_EQ( + conditional->ToString(options), + R"(f32[5,20]{1,0} conditional(pred[], f32[5,10]{1,0}, f32[5,10]{1,0}), true_computation= +{ + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls= + { + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} + ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} + } +}, false_computation= +{ + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + ROOT tmp_2 = f32[5,20]{1,0} fusion(f32[5,10]{1,0} tmp_0, f32[20,10]{1,0} tmp_1), kind=kLoop, calls= + { + tmp_0 = f32[5,10]{1,0} parameter(0) + tmp_1 = f32[20,10]{1,0} parameter(1) + tmp_2 = f32[10,20]{1,0} transpose(f32[20,10]{1,0} tmp_1), dimensions={1,0} + ROOT tmp_3 = f32[5,20]{1,0} dot(f32[5,10]{1,0} tmp_0, f32[10,20]{1,0} tmp_2), lhs_contracting_dims={1}, rhs_contracting_dims={0} + } +})"); +} + +TEST_F(HloInstructionTest, CheckDeepClone) { + const char* const hlo_string = R"( +HloModule Module + +addy (lhs: s32[], rhs: s32[]) -> s32[] { + lhs = s32[] parameter(0) + rhs = s32[] parameter(1) + ROOT zadd = s32[] add(lhs, rhs) +} + +calla (x: s32[]) -> s32[] { + x = s32[] parameter(0) + reduce = s32[] reduce-window(x, x), to_apply=addy + ROOT xadd = s32[] add(x, reduce) +} + +body (bparam: s32[]) -> s32[] { + constant = s32[] constant(1) + bparam = s32[] parameter(0) + v = s32[] call(bparam), to_apply=calla + ROOT add = s32[] add(constant, bparam) +} + +condition (cparam: s32[]) -> pred[] { + xconstant = s32[] constant(5) + cparam = s32[] parameter(0) + ROOT greater-than = pred[] greater-than(xconstant, cparam) +} + +ENTRY entry (param: s32[]) -> s32[] { + eparam = s32[] parameter(0) + ROOT while = s32[] while(eparam), condition=condition, body=body + } +)"; + // Check that deep clones really deep clones every instruction and + // computations, without leaving dangling pointers to the old module. 
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_string));
+  std::unique_ptr<HloModule> clone = module->Clone();
+  for (HloComputation* computation : clone->computations()) {
+    EXPECT_EQ(computation->parent(), clone.get());
+    for (HloInstruction* instruction : computation->instructions()) {
+      EXPECT_EQ(instruction->parent()->parent(), clone.get());
+    }
+  }
+}
+
+TEST_F(HloInstructionTest, IdenticalAccountsForBackendConfig) {
+  const Shape shape = ShapeUtil::MakeShape(F32, {42});
+  HloComputation::Builder builder("test");
+  HloInstruction* p =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p"));
+
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p, p));
+  HloInstruction* add2 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p, p));
+
+  EXPECT_TRUE(add1->Identical(*add2));
+  add1->set_raw_backend_config_string("abc");
+  EXPECT_FALSE(add1->Identical(*add2));
+}
+
+TEST_F(HloInstructionTest, IdenticalAccountsForCustomCallWindow) {
+  auto instr1 = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                 /*operands=*/{},
+                                                 /*custom_call_target=*/"foo");
+  auto instr2 = instr1->Clone();
+  EXPECT_TRUE(instr1->Identical(*instr2));
+
+  Window w = window_util::MakeWindow({1, 2, 3});
+  instr1->set_window(w);
+  EXPECT_FALSE(instr1->Identical(*instr2));
+}
+
+TEST_F(HloInstructionTest, IdenticalAccountsForCustomCallDnums) {
+  auto instr1 = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                 /*operands=*/{},
+                                                 /*custom_call_target=*/"foo");
+  auto instr2 = instr1->Clone();
+  EXPECT_TRUE(instr1->Identical(*instr2));
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_output_batch_dimension(42);
+  instr1->set_convolution_dimension_numbers(dnums);
+  EXPECT_FALSE(instr1->Identical(*instr2));
+}
+
+TEST_F(HloInstructionTest, CloneWindowOnCustomCall) {
+  auto instr = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                /*operands=*/{},
+                                                /*custom_call_target=*/"foo");
+  Window w = window_util::MakeWindow({1, 2, 3});
+  instr->set_window(w);
+  auto clone = instr->Clone();
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(clone->window(), w))
+      << clone->window().DebugString();
+}
+
+TEST_F(HloInstructionTest, CloneDnumsOnCustomCall) {
+  auto instr = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                /*operands=*/{},
+                                                /*custom_call_target=*/"foo");
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_output_batch_dimension(42);
+  instr->set_convolution_dimension_numbers(dnums);
+  auto clone = instr->Clone();
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(
+      clone->convolution_dimension_numbers(), dnums))
+      << clone->convolution_dimension_numbers().DebugString();
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
similarity index 94%
rename from tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
rename to tensorflow/compiler/xla/service/hlo_lexer.cc
index fc0e4444521247..f0d9fdbc8f86da 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/compiler/xla/tools/parser/hlo_lexer.h"
+#include "tensorflow/compiler/xla/service/hlo_lexer.h"
 #include
@@ -26,9 +26,8 @@ limitations under the License.
#include "tensorflow/core/platform/regexp.h" namespace xla { -namespace tools { -using tensorflow::StringPiece; +using ::tensorflow::StringPiece; namespace { @@ -67,12 +66,12 @@ bool HloLexer::CanDereference(const char* ptr) const { return ptr < buf_.end() && ptr >= buf_.begin(); } -StringPiece HloLexer::StringPieceFromPointers(const char* begin, - const char* end) const { +tensorflow::StringPiece HloLexer::StringPieceFromPointers( + const char* begin, const char* end) const { CHECK(begin <= end); CHECK(begin == buf_.end() || CanDereference(begin)); CHECK(end == buf_.end() || CanDereference(end)); - return StringPiece(begin, end - begin); + return tensorflow::StringPiece(begin, end - begin); } tensorflow::RegexpStringPiece HloLexer::RegexpStringPieceFromPointers( @@ -197,7 +196,8 @@ TokKind HloLexer::LexIdentifier() { return TokKind::kAttributeName; } - StringPiece identifier = StringPieceFromPointers(token_start_, current_ptr_); + tensorflow::StringPiece identifier = + StringPieceFromPointers(token_start_, current_ptr_); // See if this is a keyword. #define KEYWORD(STR) \ @@ -230,7 +230,7 @@ TokKind HloLexer::LexIdentifier() { } } - str_val_ = identifier.ToString(); + str_val_ = std::string(identifier); return TokKind::kIdent; } @@ -332,23 +332,24 @@ std::pair HloLexer::GetLineAndColumn(LocTy location) const { line_no_cache_.last_query = ptr; line_no_cache_.line_no_of_query = line_no; size_t line_offset = StringPieceFromPointers(start, ptr).rfind('\n'); - if (line_offset == StringPiece::npos) { + if (line_offset == tensorflow::StringPiece::npos) { line_offset = 0; } return {line_no, ptr - start - line_offset}; } -StringPiece HloLexer::GetLine(LocTy loc) const { +tensorflow::StringPiece HloLexer::GetLine(LocTy loc) const { if (!CanDereference(loc)) { return "LINE OUT OF RANGE"; } size_t line_start = StringPieceFromPointers(buf_.begin(), loc + 1).rfind('\n'); - const char* start = line_start == StringPiece::npos + const char* start = line_start == tensorflow::StringPiece::npos ? buf_.begin() : buf_.begin() + line_start + 1; size_t line_end = StringPieceFromPointers(loc, buf_.end()).find('\n'); - const char* end = line_end == StringPiece::npos ? buf_.end() : loc + line_end; + const char* end = + line_end == tensorflow::StringPiece::npos ? buf_.end() : loc + line_end; return StringPieceFromPointers(start, end); } @@ -370,7 +371,7 @@ TokKind HloLexer::LexString() { static LazyRE2 escaping_pattern = {R"("([^"\\]|\\.)*")"}; if (RE2::Consume(&consumable, *escaping_pattern)) { current_ptr_ = consumable.begin(); - StringPiece raw = + tensorflow::StringPiece raw = StringPieceFromPointers(token_start_ + 1, current_ptr_ - 1); string error; if (!tensorflow::str_util::CUnescape(raw, &str_val_, &error)) { @@ -453,5 +454,4 @@ string TokKindToString(TokKind kind) { } } -} // namespace tools } // namespace xla diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h similarity index 90% rename from tensorflow/compiler/xla/tools/parser/hlo_lexer.h rename to tensorflow/compiler/xla/service/hlo_lexer.h index 27880b9b8afbfa..ceb674f25e94ac 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h +++ b/tensorflow/compiler/xla/service/hlo_lexer.h @@ -13,12 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_ -#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_ #include -#include "tensorflow/compiler/xla/tools/parser/hlo_token.h" +#include "tensorflow/compiler/xla/service/hlo_token.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/stringpiece.h" @@ -27,9 +27,11 @@ limitations under the License. #include "tensorflow/core/platform/types.h" namespace xla { -namespace tools { // Lexer for the HloModule::ToString() format text. +// +// This class is meant to be used by hlo_parser.cc. You shouldn't need to use +// it directly. class HloLexer { public: explicit HloLexer(tensorflow::StringPiece buf) : buf_(buf) { @@ -57,7 +59,7 @@ class HloLexer { CHECK(GetKind() == TokKind::kShape); return shape_val_; } - int64 GetInt64Val() const { + tensorflow::int64 GetInt64Val() const { CHECK(GetKind() == TokKind::kInt); return int64_val_; } @@ -114,7 +116,7 @@ class HloLexer { TokKind current_kind_; string str_val_; Shape shape_val_; - int64 int64_val_; + tensorflow::int64 int64_val_; double decimal_val_; struct LineNoCacheTy { @@ -125,7 +127,6 @@ class HloLexer { mutable LineNoCacheTy line_no_cache_{nullptr, 0}; }; -} // namespace tools } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_ diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc new file mode 100644 index 00000000000000..43c41ece6efc4f --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.cc @@ -0,0 +1,306 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h" + +#include + +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/strings/str_util.h" +#include "tensorflow/core/lib/strings/strcat.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +using Worklist = std::deque; +using Workset = std::unordered_set; + +namespace { + +void AddToWorklist(const HloInstruction* instruction, Worklist* worklist, + Workset* workset) { + if (workset->count(instruction) == 0) { + worklist->push_back(instruction); + workset->insert(instruction); + VLOG(3) << "ADD instruction: " << instruction->name(); + } +} + +using VisitorFunction = std::function; + +void ForEachLiveIndex(const ShapeTree& index_tree, + const VisitorFunction& func) { + index_tree.ForEachElement([&](const ShapeIndex& shape_index, bool live) { + if (live) { + func(shape_index); + } + }); +} + +// Marks 'instruction' output live at 'shape_index'. +// Adds to 'worklist' iff: +// *) 'instruction' is not already on worklist. +// *) 'shape_index' has not yet been visited. +void MarkLiveAtIndex(const HloInstruction* instruction, + const ShapeIndex& shape_index, + HloLivenessAnalysis::HloIndexMap* live_index_map, + Worklist* worklist, Workset* workset) { + auto it = live_index_map->find(instruction); + if (it == live_index_map->end()) { + auto it_added = live_index_map->emplace( + std::piecewise_construct, std::forward_as_tuple(instruction), + std::forward_as_tuple(instruction->shape(), /*init_value=*/false)); + it = it_added.first; + } + if (it->second.element(shape_index) == false) { + AddToWorklist(instruction, worklist, workset); + *it->second.mutable_element(shape_index) = true; + VLOG(3) << "MARK instruction: " << instruction->name() + << " shape_index: " << shape_index.ToString(); + } +} + +// Marks 'instruction' live at all shape indices in its output. +void MarkLiveAtAllIndices(const HloInstruction* instruction, + HloLivenessAnalysis::HloIndexMap* live_index_map, + Worklist* worklist, Workset* workset) { + bool add_to_worklist = false; + auto it = live_index_map->find(instruction); + if (it == live_index_map->end()) { + live_index_map->emplace( + std::piecewise_construct, std::forward_as_tuple(instruction), + std::forward_as_tuple(instruction->shape(), /*init_value=*/true)); + add_to_worklist = true; + } else { + ShapeUtil::ForEachSubshape( + instruction->shape(), + [&](const Shape& sub_shape, const ShapeIndex& shape_index) { + if (it->second.element(shape_index) == false) { + add_to_worklist = true; + *it->second.mutable_element(shape_index) = true; + VLOG(3) << "MARK instruction: " << instruction->name() + << " shape_index: " << shape_index.ToString(); + } + }); + } + if (add_to_worklist) { + AddToWorklist(instruction, worklist, workset); + } +} + +// Propagates liveness through Tuple instructions. 
+// *) For each tuple operand: +// *) For tuple output shape index associated with operand: +// *) Propgate live shape indices to tuple operand at the associated +// shape index in the operands output, and add to worklist. +void PropagateLivenessThroughTuple( + const HloInstruction* instruction, + HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist, + Workset* workset) { + CHECK_EQ(instruction->opcode(), HloOpcode::kTuple); + for (int64 operand_index = 0; operand_index < instruction->operand_count(); + ++operand_index) { + const ShapeTree& index_tree = FindOrDie(*live_index_map, instruction); + ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) { + if (shape_index.empty() || shape_index[0] != operand_index) { + return; + } + // Mark top-level index of operand at 'operand_index'. + MarkLiveAtIndex(instruction->operand(operand_index), {}, live_index_map, + worklist, workset); + // Mark sub-shape index of operand at 'operand_index'. + ShapeIndex operand_shape_index; + for (int i = 1; i < shape_index.size(); ++i) { + operand_shape_index.push_back(shape_index[i]); + } + MarkLiveAtIndex(instruction->operand(operand_index), operand_shape_index, + live_index_map, worklist, workset); + }); + } +} + +// Propagates liveness through GetTupleElement instructions. +// *) For each live index in GetTupleElement output, mark output of GTE operand +// at associated shape index in its output, and add to worklist. +void PropagateLivenessThroughGTE( + const HloInstruction* instruction, + HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist, + Workset* workset) { + CHECK_EQ(instruction->opcode(), HloOpcode::kGetTupleElement); + // Mark operand top-level index. + MarkLiveAtIndex(instruction->operand(0), {}, live_index_map, worklist, + workset); + const ShapeTree& index_tree = FindOrDie(*live_index_map, instruction); + // Propagate live shape indices along GTE -> Tuple edge. + ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) { + ShapeIndex operand_shape_index(shape_index); + operand_shape_index.push_front(instruction->tuple_index()); + MarkLiveAtIndex(instruction->operand(0), operand_shape_index, + live_index_map, worklist, workset); + }); +} + +// Propagates liveness through While instructions. +// *) For each live index in While output, mark shape index of while.body.root +// and while.operand (adding each to worklist). +// *) Mark while.cond.root and add to worklist. +void PropagateLivenessThroughWhile( + const HloInstruction* instruction, + HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist, + Workset* workset) { + CHECK_EQ(instruction->opcode(), HloOpcode::kWhile); + const ShapeTree& index_tree = FindOrDie(*live_index_map, instruction); + + ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) { + // Propagate liveness to while body computation root instruction. + MarkLiveAtIndex(instruction->while_body()->root_instruction(), shape_index, + live_index_map, worklist, workset); + // Propagate liveness to tuple-shaped operand. + MarkLiveAtIndex(instruction->operand(0), shape_index, live_index_map, + worklist, workset); + }); + + // Propagate liveness to while condition computation root instruction. + MarkLiveAtIndex(instruction->while_condition()->root_instruction(), {}, + live_index_map, worklist, workset); +} + +// Propagates liveness out of Parameter instructions to callers and aliasing +// positions. 
This can occur if liveness propagates to a parameter in the +// while.condition computation, requiring liveness to propagate out to caller +// callsite while (and while.body.root). +void PropagateLivenessToParameterCallers( + const HloInstruction* instruction, + HloLivenessAnalysis::HloIndexMap* live_index_map, Worklist* worklist, + Workset* workset, CallGraph* call_graph) { + CHECK_EQ(instruction->opcode(), HloOpcode::kParameter); + const CallGraphNode& call_graph_node = + call_graph->GetNode(instruction->parent()); + if (call_graph_node.context() == CallContext::kSequential) { + for (const CallSite& callsite : call_graph_node.caller_callsites()) { + if (callsite.instruction()->opcode() == HloOpcode::kWhile) { + auto* xla_while = callsite.instruction(); + const ShapeTree& index_tree = + FindOrDie(*live_index_map, instruction); + ForEachLiveIndex(index_tree, [&](const ShapeIndex& shape_index) { + // Propagate liveness to while result{shape_index} + MarkLiveAtIndex(xla_while, shape_index, live_index_map, worklist, + workset); + // Propagate liveness to while body root{shape_index}. + MarkLiveAtIndex(xla_while->while_body()->root_instruction(), + shape_index, live_index_map, worklist, workset); + // Propagate liveness to operand(0){shape_index}. + MarkLiveAtIndex(xla_while->operand(0), shape_index, live_index_map, + worklist, workset); + }); + } + } + } +} + +} // namespace + +HloLivenessAnalysis::HloLivenessAnalysis(const HloModule& module) + : module_(module), call_graph_(CallGraph::Build(&module)) {} + +// Runs liveness analysis on 'module_'. +// Initializes worklist with entry root instruction (and any instruction with +// side-effects), marking all of their output shape indices live. +// Visits elements on worklist, propagating liveness from an instructions +// live output shape indices to its called computations and operands. +void HloLivenessAnalysis::RunAnalysis() { + Worklist worklist; + Workset workset; + // Add entry compuation root instruction. + MarkLiveAtAllIndices(module_.entry_computation()->root_instruction(), + &live_index_map_, &worklist, &workset); + for (auto* computation : module_.computations()) { + for (auto* instruction : computation->instructions()) { + if (instruction->HasSideEffectNoRecurse()) { + // Add instructions with side effects. + MarkLiveAtAllIndices(instruction, &live_index_map_, &worklist, + &workset); + } + } + } + + while (!worklist.empty()) { + const HloInstruction* instruction = worklist.front(); + worklist.pop_front(); + workset.erase(workset.find(instruction)); + VLOG(1) << "VISIT instruction: " << instruction->name(); + + if (instruction->opcode() == HloOpcode::kTuple) { + PropagateLivenessThroughTuple(instruction, &live_index_map_, &worklist, + &workset); + } else if (instruction->opcode() == HloOpcode::kGetTupleElement) { + PropagateLivenessThroughGTE(instruction, &live_index_map_, &worklist, + &workset); + } else if (instruction->opcode() == HloOpcode::kWhile && + ShapeUtil::IsTuple(instruction->shape())) { + PropagateLivenessThroughWhile(instruction, &live_index_map_, &worklist, + &workset); + } else if (instruction->opcode() == HloOpcode::kParameter && + ShapeUtil::IsTuple(instruction->shape())) { + PropagateLivenessToParameterCallers(instruction, &live_index_map_, + &worklist, &workset, + call_graph_.get()); + } else { + // Propagate liveness to called computations. 
+ for (auto* called_computation : instruction->called_computations()) { + MarkLiveAtAllIndices(called_computation->root_instruction(), + &live_index_map_, &worklist, &workset); + } + // Propagate liveness to operands. + for (HloInstruction* operand : instruction->operands()) { + MarkLiveAtAllIndices(operand, &live_index_map_, &worklist, &workset); + } + } + } +} + +bool HloLivenessAnalysis::IsLive(const HloInstruction* instruction, + const ShapeIndex& shape_index) const { + if (ContainsKey(live_index_map_, instruction)) { + return FindOrDie(live_index_map_, instruction).element(shape_index); + } + return false; +} + +/* static */ +StatusOr> HloLivenessAnalysis::Run( + const HloModule& module) { + VLOG(1) << "HloLivenessAnalysis::Run on module " << module.name(); + XLA_VLOG_LINES(2, module.ToString()); + + auto liveness_analysis = WrapUnique(new HloLivenessAnalysis(module)); + + liveness_analysis->RunAnalysis(); + + return std::move(liveness_analysis); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis.h b/tensorflow/compiler/xla/service/hlo_liveness_analysis.h new file mode 100644 index 00000000000000..fe55a8070a42a3 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis.h @@ -0,0 +1,66 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVENESS_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVENESS_ANALYSIS_H_ + +#include + +#include "tensorflow/compiler/xla/service/call_graph.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_value.h" +#include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// Analysis which identifies all live {HloInstruction, ShapeIndex} pairs in +// an HLO module. +// +// HloLivenessAnalysis marks the shape index of each live output of each +// instruction in the module, by propagating live shape index information +// from an instruction to its called computations and operands. +class HloLivenessAnalysis { + public: + // Maps from an HloInstruction to its live/dead output shape indices. + using HloIndexMap = + std::unordered_map>; + + // Runs liveness analysis on 'module'. Returns HloLivenessAnalysis object + // which exports liveness for each {HloInstruction, ShapeIndex} in 'module'. + static StatusOr> Run( + const HloModule& module); + + // Returns true if output of 'instruction' at 'shape_index' is live. + // Returns false otherwise. 
+ bool IsLive(const HloInstruction* instruction, + const ShapeIndex& shape_index) const; + + private: + HloLivenessAnalysis(const HloModule& module); + + void RunAnalysis(); + + const HloModule& module_; + std::unique_ptr call_graph_; + HloIndexMap live_index_map_; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LIVENESS_ANALYSIS_H_ diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc new file mode 100644 index 00000000000000..0275294a1a86ce --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc @@ -0,0 +1,402 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h" + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/test.h" + +namespace xla { +namespace { + +class HloLivenessAnalysisTest : public HloTestBase { + protected: + HloLivenessAnalysisTest() {} + + // Run liveness analysis on the member module. For convenience returns a + // reference to the generated analysis stored in analysis_. + const HloLivenessAnalysis& RunLiveness(HloModule* module) { + liveness_ = HloLivenessAnalysis::Run(*module).ConsumeValueOrDie(); + return *liveness_; + } + + HloInstruction* GetInstruction(HloModule* module, const string& name) { + HloInstruction* to_return = nullptr; + for (auto* comp : module->computations()) { + for (auto* inst : comp->instructions()) { + if (inst->name() == name) { + to_return = inst; + break; + } + } + } + return CHECK_NOTNULL(to_return); + } + + std::unique_ptr liveness_; +}; + +// Test that add instruction at entry root is live at all output shape indices. +TEST_F(HloLivenessAnalysisTest, AddAtEntryRoot) { + auto module = ParseHloString(R"( + HloModule SimpleModule + ENTRY SimpleComputation { + constant.1 = s32[] constant(0) + constant.2 = s32[] constant(1) + ROOT add = s32[] add(constant.1, constant.2) + })") + .ValueOrDie(); + const HloLivenessAnalysis& liveness = RunLiveness(module.get()); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {})); +} + +// Test that a dead add instruction is marked as dead by analysis. 
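In the `DeadAdd` test below, `add.1` has no users and is not the entry root, so no liveness ever reaches it. A consumer of the analysis would typically walk the module and collect such instructions; the following is a minimal sketch (the function name is hypothetical) that uses only calls appearing elsewhere in this change, namely `computations()`, `instructions()`, and `IsLive`:

```c++
#include <vector>

#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"

namespace xla {

// Sketch: gather instructions whose top-level output is dead according to the
// analysis. `liveness` is the analysis produced by
// HloLivenessAnalysis::Run(module), as in the tests in this file.
std::vector<const HloInstruction*> CollectDeadInstructions(
    const HloModule& module, const HloLivenessAnalysis& liveness) {
  std::vector<const HloInstruction*> dead;
  for (const HloComputation* computation : module.computations()) {
    for (const HloInstruction* instruction : computation->instructions()) {
      if (!liveness.IsLive(instruction, /*shape_index=*/{})) {
        dead.push_back(instruction);
      }
    }
  }
  return dead;
}

}  // namespace xla
```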
+TEST_F(HloLivenessAnalysisTest, DeadAdd) {
+  auto module = ParseHloString(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    add.1 = s32[] add(constant.1, constant.2)
+    ROOT add.2 = s32[] add(constant.1, constant.2)
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "add.1"), {}));
+}
+
+// Test that all output shape indices of the entry root tuple (and the
+// instructions defining its elements) are marked live.
+TEST_F(HloLivenessAnalysisTest, TupleAtEntryRoot) {
+  auto module = ParseHloString(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    ROOT tuple.1 = (s32[], s32[]) tuple(constant.1, constant.2)
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+}
+
+// Tests that all outputs of a nested tuple at the entry root (and the
+// instructions defining the values appearing in its output) are marked live.
+TEST_F(HloLivenessAnalysisTest, NestedTupleAtEntryRoot) {
+  auto module = ParseHloString(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(1)
+    constant.2 = s32[] constant(2)
+    constant.3 = s32[] constant(3)
+    tuple.1 = (s32[], s32[]) tuple(constant.2, constant.3)
+    ROOT tuple.2 = (s32[], s32[]) tuple(constant.1, tuple.1)
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {}));
+}
+
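The `{0}`, `{1}`, `{1, 0}`-style arguments in these expectations are `ShapeIndex` values addressing elements of (possibly nested) tuple shapes. For reference, a small sketch of how such an index resolves against a nested tuple shape, using `ShapeUtil` calls that appear elsewhere in this change (the function and variable names are illustrative only):

```c++
#include "tensorflow/compiler/xla/shape_util.h"
#include "tensorflow/core/platform/logging.h"

namespace xla {

void ShapeIndexExample() {
  // A nested tuple shape (s32[], (s32[], s32[])), like tuple.2 in these tests.
  const Shape nested = ShapeUtil::MakeTupleShape(
      {ShapeUtil::MakeShape(S32, {}),
       ShapeUtil::MakeTupleShape(
           {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})})});

  // {} addresses the whole tuple, {1} the inner tuple, and {1, 0} the first
  // element of the inner tuple; liveness is tracked per such index.
  const Shape& inner = ShapeUtil::GetTupleElementShape(nested, 1);
  const Shape& leaf = ShapeUtil::GetSubshape(nested, ShapeIndex({1, 0}));
  CHECK(ShapeUtil::IsTuple(inner));
  CHECK(!ShapeUtil::IsTuple(leaf));
}

}  // namespace xla
```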
+// Tests that a GTE of a tuple at the entry root only propagates liveness to
+// the live elements of the tuple.
+TEST_F(HloLivenessAnalysisTest, GteOfTuple) {
+  auto module = ParseHloString(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    tuple.1 = (s32[], s32[]) tuple(constant.1, constant.2)
+    ROOT get-tuple-element.1 = s32[] get-tuple-element(tuple.1), index=0
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+}
+
+// Tests that a GTE of a nested tuple at the entry root only propagates
+// liveness to the live elements of the tuple.
+TEST_F(HloLivenessAnalysisTest, GteOfNestedTuple) {
+  auto module = ParseHloString(R"(
+  HloModule SimpleModule
+  ENTRY SimpleComputation {
+    constant.1 = s32[] constant(0)
+    constant.2 = s32[] constant(1)
+    constant.3 = s32[] constant(2)
+    tuple.1 = (s32[], s32[]) tuple(constant.2, constant.3)
+    tuple.2 = (s32[], s32[]) tuple(constant.1, tuple.1)
+    ROOT get-tuple-element.1 = (s32[], s32[]) get-tuple-element(tuple.2), index=1
+  })")
+                    .ValueOrDie();
+  const HloLivenessAnalysis& liveness = RunLiveness(module.get());
+  EXPECT_TRUE(
+      liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(
+      GetInstruction(module.get(), "get-tuple-element.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(
+      GetInstruction(module.get(), "get-tuple-element.1"), {1}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {}));
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 1}));
+
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1}));
+
+  EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {}));
+  EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {}));
+}
+
+// Tests that a GTE of a GTE (at the entry root) of a nested tuple only
+// propagates liveness to the live elements of the tuple.
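The GTE-of-GTE case described above chains `PropagateLivenessThroughGTE` twice: each `get-tuple-element` prepends its `tuple_index()` to the live index before marking its operand. A sketch of that index arithmetic for the module in the next test, wrapped in a hypothetical helper (instruction names refer to the HLO text of that test):

```c++
#include "tensorflow/compiler/xla/shape_util.h"

namespace xla {

// Index arithmetic performed for the chained GTEs in the next test:
// get-tuple-element.2 (index=0) is live at {}; its operand
// get-tuple-element.1 (index=1 into tuple.2) becomes live at {0}; and
// tuple.2 becomes live at {1, 0}, matching the expectations below.
ShapeIndex ChainedGteOperandIndex() {
  ShapeIndex index;     // live index of get-tuple-element.2: {}
  index.push_front(0);  // as an index into get-tuple-element.1: {0}
  index.push_front(1);  // as an index into tuple.2: {1, 0}
  return index;
}

}  // namespace xla
```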
+TEST_F(HloLivenessAnalysisTest, GteOfGteOfNestedTuple) { + auto module = ParseHloString(R"( + HloModule SimpleModule + ENTRY SimpleComputation { + constant.1 = s32[] constant(0) + constant.2 = s32[] constant(1) + constant.3 = s32[] constant(2) + tuple.1 = (s32[], s32[]) tuple(constant.2, constant.3) + tuple.2 = (s32[], s32[]) tuple(constant.1, tuple.1) + get-tuple-element.1 = (s32[], s32[]) get-tuple-element(tuple.2), index=1 + ROOT get-tuple-element.2 = s32[] get-tuple-element(get-tuple-element.1), index=0 + })") + .ValueOrDie(); + const HloLivenessAnalysis& liveness = RunLiveness(module.get()); + EXPECT_TRUE( + liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.2"), {})); + + EXPECT_TRUE( + liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.1"), {})); + EXPECT_TRUE(liveness.IsLive( + GetInstruction(module.get(), "get-tuple-element.1"), {0})); + EXPECT_FALSE(liveness.IsLive( + GetInstruction(module.get(), "get-tuple-element.1"), {1})); + + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {})); + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 0})); + EXPECT_FALSE( + liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1, 1})); + + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0})); + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1})); + + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.2"), {})); + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {})); +} + +// Test that live/dead while tuple elements are marked live/dead correctly. +TEST_F(HloLivenessAnalysisTest, WhileWithDeadTupleElement) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body { + loop_var.1 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add.0 = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1 + multiply.0 = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple.0 = (s32[], s32[3]{0}) tuple(add.0, multiply.0) + } + SimpleLoop.condition { + loop_var.2 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + constant.2 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2) + } + ENTRY SimpleLoop { + constant.3 = s32[] constant(0) + constant.4 = s32[3]{0} constant({0, 1, 2}) + tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4) + while.0 = (s32[], s32[3]{0}) while(tuple.1), condition= + SimpleLoop.condition, body=SimpleLoop.body + ROOT get-tuple-element.4 = s32[] get-tuple-element(while.0), index=0 + })") + .ValueOrDie(); + const HloLivenessAnalysis& liveness = RunLiveness(module.get()); + EXPECT_TRUE( + liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.4"), {})); + + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {0})); + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {1})); + + // While operand. 
+ EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0})); + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {})); + + // While body. + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {0})); + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add.0"), {})); + EXPECT_FALSE(liveness.IsLive(GetInstruction(module.get(), "multiply.0"), {})); +} + +// Tests that a tuple element live in while.cond computation, propagates +// liveness to while.body.root/while.result/while.operand (where it is unused). +TEST_F(HloLivenessAnalysisTest, WhileCondPropagatesLiveness) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body { + loop_var.1 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add.0 = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1 + multiply.0 = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple.0 = (s32[], s32[3]{0}) tuple(add.0, multiply.0) + } + SimpleLoop.condition { + loop_var.2 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=1 + add.1 = s32[] add(get-tuple-element.3, get-tuple-element.4) + constant.2 = s32[] constant(5) + ROOT less-than = pred[] less-than(add.1, constant.2) + } + ENTRY SimpleLoop { + constant.3 = s32[] constant(0) + constant.4 = s32[3]{0} constant({0, 1, 2}) + tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4) + while.0 = (s32[], s32[3]{0}) while(tuple.1), condition= + SimpleLoop.condition, body=SimpleLoop.body + ROOT get-tuple-element.5 = s32[] get-tuple-element(while.0), index=0 + })") + .ValueOrDie(); + const HloLivenessAnalysis& liveness = RunLiveness(module.get()); + EXPECT_TRUE( + liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.5"), {})); + + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.0"), {1})); + + // While operand. + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.3"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "constant.4"), {})); + + // While body. + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.0"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "add.0"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "multiply.0"), {})); +} + +// Tests that a use of while.result{0} propagates liveness to +// while.body.param{1} to while.body.root{1}, and then to while.body.param{2}. 
+TEST_F(HloLivenessAnalysisTest, WhileWithLiveTupleElements) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body { + loop_var.1 = (s32[], s32[], s32[]) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + get-tuple-element.2 = s32[] get-tuple-element(loop_var.1), index=1 + add.1 = s32[] add(get-tuple-element.1, get-tuple-element.2) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.1), index=2 + multiply.1 = s32[] multiply(get-tuple-element.3, get-tuple-element.3) + ROOT tuple.1 = (s32[], s32[], s32[]) tuple(add.1, get-tuple-element.3, multiply.1) + } + SimpleLoop.condition { + loop_var.2 = (s32[], s32[], s32[]) parameter(0) + get-tuple-element.4 = s32[] get-tuple-element(loop_var.2), index=0 + constant.1 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.4, constant.1) + } + ENTRY SimpleLoop { + constant.2 = s32[] constant(0) + constant.3 = s32[] constant(1) + constant.4 = s32[] constant(2) + tuple.2 = (s32[], s32[], s32[]) tuple(constant.2, constant.3, constant.4) + while.1 = (s32[], s32[], s32[]) while(tuple.2), condition= + SimpleLoop.condition, body=SimpleLoop.body + ROOT get-tuple-element.5 = s32[] get-tuple-element(while.1), index=0 + })") + .ValueOrDie(); + + const HloLivenessAnalysis& liveness = RunLiveness(module.get()); + EXPECT_TRUE( + liveness.IsLive(GetInstruction(module.get(), "get-tuple-element.5"), {})); + + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "while.1"), {2})); + // While operand. + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.2"), {2})); + // While body root. + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "tuple.1"), {2})); + // While body param. + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {0})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {1})); + EXPECT_TRUE(liveness.IsLive(GetInstruction(module.get(), "loop_var.1"), {2})); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc index 69deac263ee58f..7e4b8834357d39 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers.cc @@ -17,10 +17,13 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/lib/strings/str_util.h" namespace xla { namespace testing { +using ::tensorflow::str_util::Join; + bool HloMatcher::MatchAndExplain( const HloInstruction* instruction, ::testing::MatchResultListener* listener) const { @@ -195,6 +198,41 @@ void HloShardingMatcher::DescribeTo(std::ostream* os) const { } } +bool HloDotWithContractingDimsMatcher::MatchAndExplain( + const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const { + if (!HloMatcher::MatchAndExplain(instruction, listener)) { + return false; + } + + const DotDimensionNumbers& dim_nums = instruction->dot_dimension_numbers(); + if (dim_nums.lhs_contracting_dimensions_size() != 1 || + dim_nums.lhs_contracting_dimensions(0) != lhs_contracting_dim_) { + *listener << instruction->ToString() + << " has wrong lhs_contracting_dimensions (got {" + << Join(dim_nums.lhs_contracting_dimensions(), ",") << "} want {" + << lhs_contracting_dim_ << "})"; + return false; + } + + if (dim_nums.rhs_contracting_dimensions_size() != 1 || + dim_nums.rhs_contracting_dimensions(0) != rhs_contracting_dim_) { + *listener << instruction->ToString() + << " has wrong rhs_contracting_dimensions (got {" + << Join(dim_nums.rhs_contracting_dimensions(), ",") << "} want {" + << rhs_contracting_dim_ << "})"; + return false; + } + + return true; +} + +void HloDotWithContractingDimsMatcher::DescribeTo(std::ostream* os) const { + HloMatcher::DescribeTo(os); + *os << " with lhs_contracting_dims={" << lhs_contracting_dim_ + << "} and rhs_contracting_dims={" << rhs_contracting_dim_ << "}"; +} + } // namespace testing void PrintTo(const HloInstruction* inst, ::std::ostream* os) { diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h index f2ab9b5d9b6e00..c570b420c21fed 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers.h +++ b/tensorflow/compiler/xla/service/hlo_matchers.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_ #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/core/lib/gtl/optional.h" @@ -131,6 +132,27 @@ class HloShardingMatcher tensorflow::gtl::optional sharding_; }; +// Matches a Dot HLO instruction with specific LHS and RHS contracting +// dimensions. +class HloDotWithContractingDimsMatcher : public HloMatcher { + public: + explicit HloDotWithContractingDimsMatcher( + ::testing::Matcher lhs, + ::testing::Matcher rhs, int64 lhs_contracting_dim, + int64 rhs_contracting_dim) + : HloMatcher(HloOpcode::kDot, /*operands=*/{lhs, rhs}), + lhs_contracting_dim_(lhs_contracting_dim), + rhs_contracting_dim_(rhs_contracting_dim) {} + + bool MatchAndExplain(const HloInstruction* instruction, + ::testing::MatchResultListener* listener) const override; + void DescribeTo(std::ostream* os) const override; + + private: + int64 lhs_contracting_dim_; + int64 rhs_contracting_dim_; +}; + // HloInstruction* matchers for opcode and operands. 
Example: // namespace op = xla::opcode_matchers; // EXPECT_THAT(instruction, @@ -158,7 +180,6 @@ HLO_MATCHER(Convolution); HLO_MATCHER(Copy); HLO_MATCHER(CrossReplicaSum); HLO_MATCHER(Divide); -HLO_MATCHER(Dot); HLO_MATCHER(DynamicSlice); HLO_MATCHER(DynamicUpdateSlice); HLO_MATCHER(Eq); @@ -282,11 +303,21 @@ inline ::testing::Matcher Shape( const class Shape& shape) { return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher(shape)); } +inline ::testing::Matcher Shape( + tensorflow::StringPiece shape) { + return ::testing::MakeMatcher(new ::xla::testing::HloShapeMatcher( + ShapeUtil::ParseShapeString(shape).ValueOrDie())); +} inline ::testing::Matcher ShapeWithLayout( const class Shape& shape) { return ::testing::MakeMatcher( new ::xla::testing::HloShapeAndLayoutMatcher(shape)); } +inline ::testing::Matcher ShapeWithLayout( + tensorflow::StringPiece shape) { + return ::testing::MakeMatcher(new ::xla::testing::HloShapeAndLayoutMatcher( + ShapeUtil::ParseShapeString(shape).ValueOrDie())); +} // Verifies the value of the HloSharing against the provided sharding object. inline ::testing::Matcher Sharding( @@ -294,12 +325,42 @@ inline ::testing::Matcher Sharding( return ::testing::MakeMatcher( new ::xla::testing::HloShardingMatcher(sharding)); } +// Matcher for Sharding from sharding string +inline ::testing::Matcher Sharding( + tensorflow::StringPiece sharding) { + return ::testing::MakeMatcher(new ::xla::testing::HloShardingMatcher( + ParseSharding(sharding).ValueOrDie())); +} // Verifies that no HloSharding is set for an HLO instruction. inline ::testing::Matcher NoSharding() { return ::testing::MakeMatcher( new ::xla::testing::HloShardingMatcher(tensorflow::gtl::nullopt)); } +inline ::testing::Matcher Dot( + ::testing::Matcher lhs_matcher, + ::testing::Matcher rhs_matcher) { + return ::testing::MakeMatcher(new ::xla::testing::HloMatcher( + ::xla::HloOpcode::kDot, {lhs_matcher, rhs_matcher})); +} + +// Matches a Dot HLO instruction if it has exactly one lhs contracting dimension +// equal to `lhs_contracting_dim` and exactly one rhs contracting dimension +// equal to `rhs_contracting_dim`. +// +// Currently the HLO verifier rejects Dot operations with more than one +// contracting dimension (even though we can represent these in the +// DotDimensionNumbers proto) so there is no need to generalize this to support +// multiple contracting dimensions. +inline ::testing::Matcher Dot( + ::testing::Matcher lhs_matcher, + ::testing::Matcher rhs_matcher, + int64 lhs_contracting_dim, int64 rhs_contracting_dim) { + return ::testing::MakeMatcher( + new ::xla::testing::HloDotWithContractingDimsMatcher( + lhs_matcher, rhs_matcher, lhs_contracting_dim, rhs_contracting_dim)); +} + #undef HLO_MATCHER } // namespace opcode_matchers diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc index c6373b2e46af7d..9a3010cf1ff75e 100644 --- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc +++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" namespace op = xla::testing::opcode_matchers; @@ -105,21 +106,28 @@ TEST(HloMatchersTest, ShapeMatcher) { 0, ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}), "param"); EXPECT_THAT(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {5, 7}))); + EXPECT_THAT(p0.get(), op::Shape("f32[5,7]")); EXPECT_THAT( p0.get(), ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {5, 7})))); + EXPECT_THAT(p0.get(), ::testing::Not(op::ShapeWithLayout("f32[5,7]"))); EXPECT_THAT(p0.get(), ::testing::Not(op::Shape(ShapeUtil::MakeShape(F32, {7, 5})))); + EXPECT_THAT(p0.get(), ::testing::Not(op::Shape("f32[7,5]"))); EXPECT_THAT( p0.get(), ::testing::Not(op::ShapeWithLayout(ShapeUtil::MakeShape(F32, {7, 5})))); + EXPECT_THAT(p0.get(), ::testing::Not(op::ShapeWithLayout("f32[7,5]"))); EXPECT_THAT(p0.get(), op::Shape(ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {0, 1}))); + EXPECT_THAT(p0.get(), op::Shape("f32[5,7]{0,1}")); EXPECT_THAT(p0.get(), op::ShapeWithLayout(ShapeUtil::MakeShapeWithLayout( F32, {5, 7}, {0, 1}))); + EXPECT_THAT(p0.get(), op::ShapeWithLayout("f32[5,7]{0,1}")); EXPECT_THAT(p0.get(), ::testing::Not(op::ShapeWithLayout( ShapeUtil::MakeShapeWithLayout(F32, {5, 7}, {1, 0})))); + EXPECT_THAT(p0.get(), ::testing::Not(op::ShapeWithLayout("f32[5,7]{1,0}"))); EXPECT_THAT(Explain(p0.get(), op::Shape(ShapeUtil::MakeShape(F32, {7, 5}))), "%param = f32[5,7]{0,1} parameter(0) has incorrect shape " @@ -139,6 +147,18 @@ TEST(HloMatchersTest, ShardingMatcher) { "param.1"); p1->set_sharding(HloSharding::AssignDevice(1)); + auto tuple_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {7}), ShapeUtil::MakeShape(S32, {9}), + ShapeUtil::MakeShape(F32, {11})}); + auto p2 = HloInstruction::CreateParameter(1, tuple_shape, "param.2"); + Array assignment({2}); + assignment.SetValues({0, 1}); + auto sharding = HloSharding::Tuple( + tuple_shape, + {HloSharding::Tile(ShapeUtil::MakeShape(F32, {5}), assignment), + HloSharding::AssignDevice(1), HloSharding::Replicate()}); + p2->set_sharding(sharding); + EXPECT_THAT(p0.get(), op::NoSharding()); EXPECT_THAT(p0.get(), ::testing::Not(op::Sharding(HloSharding::AssignDevice(1)))); @@ -147,6 +167,11 @@ TEST(HloMatchersTest, ShardingMatcher) { ::testing::Not(op::Sharding(HloSharding::AssignDevice(0)))); EXPECT_THAT(p1.get(), op::Sharding(HloSharding::AssignDevice(1))); + EXPECT_THAT( + p2.get(), + op::Sharding( + "{{f32[5] devices=[2]0,1}, {maximal device=1}, {replicated}}")); + EXPECT_THAT(Explain(p0.get(), op::Sharding(HloSharding::AssignDevice(1))), "%param.0 = f32[5]{0} parameter(0) has no sharding (expected: " "{maximal device=1})"); @@ -158,5 +183,41 @@ TEST(HloMatchersTest, ShardingMatcher) { "has incorrect sharding (expected: {maximal device=0})"); } +TEST(HloMatchersTest, DotMatcher) { + string hlo_string = R"( +HloModule DotOperationFusion_TransposeFusion + +ENTRY DotOperationFusion_TransposeFusion { + arg0 = f32[1,256] parameter(0) + arg1 = f32[256,1024] parameter(1) + ROOT dot = f32[1,1024] dot(arg0, arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + HloInstruction* root = module->entry_computation()->root_instruction(); + + EXPECT_THAT(root, op::Dot(op::Parameter(0), op::Parameter(1), + /*lhs_contracting_dim=*/1, 
+ /*rhs_contracting_dim=*/0)); + + EXPECT_THAT( + Explain(root, op::Dot(op::Parameter(0), op::Parameter(1), + /*lhs_contracting_dim=*/0, + /*rhs_contracting_dim=*/0)), + "%dot = f32[1,1024]{1,0} dot(f32[1,256]{1,0} %arg0, f32[256,1024]{1,0} " + "%arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} has wrong " + "lhs_contracting_dimensions (got {1} want {0})"); + + EXPECT_THAT( + Explain(root, op::Dot(op::Parameter(0), op::Parameter(1), + /*lhs_contracting_dim=*/1, + /*rhs_contracting_dim=*/1)), + "%dot = f32[1,1024]{1,0} dot(f32[1,256]{1,0} %arg0, f32[256,1024]{1,0} " + "%arg1), lhs_contracting_dims={1}, rhs_contracting_dims={0} has wrong " + "rhs_contracting_dimensions (got {0} want {1})"); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc index 08b9a29aeda2ee..e63424c2dfb6c7 100644 --- a/tensorflow/compiler/xla/service/hlo_module.cc +++ b/tensorflow/compiler/xla/service/hlo_module.cc @@ -41,14 +41,23 @@ HloModule::HloModule(const string& name, entry_computation_handle_(entry_computation_handle), unique_id_(next_unique_module_id_++) {} -HloModule::HloModule(const string& name) - : name_(NameUniquer::GetSanitizedName(name)), - unique_id_(next_unique_module_id_++) {} HloModule::HloModule(const string& name, const HloModuleConfig& config) : name_(NameUniquer::GetSanitizedName(name)), config_(config), unique_id_(next_unique_module_id_++) {} +StatusOr HloModule::LaunderConstInstructionFromModule( + const HloInstruction* hlo) { + if (hlo == nullptr) { + return nullptr; + } + + TF_RET_CHECK(hlo->GetModule() == this); + + // TODO(b/78350259): Eliminate const laundering. + return const_cast(hlo); +} + HloComputation* HloModule::AddComputationInternal( std::unique_ptr computation, bool is_entry, bool uniquify_names) { @@ -58,7 +67,7 @@ HloComputation* HloModule::AddComputationInternal( // If the module configuration has no entry layout computation set, create a // default one based on the program shape. - if (!config_.has_entry_computation_layout()) { + if (!config_.has_host_entry_computation_layout()) { config_.SetDefaultComputationLayout( entry_computation_->ComputeProgramShape()); } @@ -232,11 +241,14 @@ StatusOr> HloModule::CreateFromProto( TF_RET_CHECK(proto.has_program_shape()) << "No program shape found in the proto"; const auto& expected_program_shape = proto.program_shape(); - TF_RET_CHECK(expected_program_shape.parameters_size() == - module_config.entry_computation_layout().parameter_count()); + TF_RET_CHECK( + expected_program_shape.parameters_size() == + module_config.device_entry_computation_layout().parameter_count()); for (int i = 0; i < expected_program_shape.parameters_size(); ++i) { const Shape& parameter_shape = - module_config.entry_computation_layout().parameter_layout(i).shape(); + module_config.device_entry_computation_layout() + .parameter_layout(i) + .shape(); TF_RET_CHECK(ShapeUtil::Compatible(expected_program_shape.parameters(i), parameter_shape)) << "HloModuleConfig has different shape for parameter " << i @@ -246,7 +258,7 @@ StatusOr> HloModule::CreateFromProto( << ", actual: " << ShapeUtil::HumanStringWithLayout(parameter_shape); } const Shape& result_shape = - module_config.entry_computation_layout().result_layout().shape(); + module_config.device_entry_computation_layout().result_layout().shape(); TF_RET_CHECK( ShapeUtil::Compatible(expected_program_shape.result(), result_shape)) << "HloModuleConfig has different result shape than the HLO module. 
" @@ -254,24 +266,44 @@ StatusOr> HloModule::CreateFromProto( << ShapeUtil::HumanStringWithLayout(expected_program_shape.result()) << ", actual: " << ShapeUtil::HumanStringWithLayout(result_shape); - auto module = MakeUnique(proto.name(), entry_computation_handle, - module_config); - tensorflow::gtl::FlatMap computation_map; + tensorflow::gtl::FlatMap to_proto_id; + std::vector> computations; + HloComputation* entry = nullptr; for (const HloComputationProto& computation_proto : proto.computations()) { - TF_ASSIGN_OR_RETURN(std::unique_ptr computation, - HloComputation::CreateFromProto( - module.get(), computation_proto, computation_map)); + TF_ASSIGN_OR_RETURN( + std::unique_ptr computation, + HloComputation::CreateFromProto(computation_proto, computation_map)); CHECK_NE(computation.get(), nullptr); int64 computation_id = computation_proto.id(); TF_RET_CHECK(computation_id != -1); TF_RET_CHECK(!ContainsKey(computation_map, computation_id)); + computation_map[computation_id] = computation.get(); + to_proto_id[computation.get()] = computation_id; + if (computation_id == proto.entry_computation_id()) { + entry = computation.get(); + } + computations.push_back(std::move(computation)); + } + TF_RET_CHECK(entry != nullptr); + + auto module = MakeUnique(proto.name(), entry_computation_handle, + module_config); + + // Sort the computations in the proto id's order. + std::sort(computations.begin(), computations.end(), + [&](const std::unique_ptr& a, + const std::unique_ptr& b) { + return to_proto_id[a.get()] < to_proto_id[b.get()]; + }); + + // Add sorted computations to the module. + for (auto& computation : computations) { + bool is_entry = computation.get() == entry; // Don't uniquify names because we want names to be stable across // serialization and deserialization. - computation_map[computation_id] = module->AddComputationInternal( - std::move(computation), - /*is_entry=*/proto.entry_computation_id() == computation_id, - /*uniquify_names=*/false); + module->AddComputationInternal(std::move(computation), is_entry, + /*uniquify_names=*/false); } TF_RET_CHECK(module->entry_computation_ != nullptr); @@ -306,7 +338,7 @@ StatusOr HloModule::CreateModuleConfigFromProto( // The module config is constructed with default layouts regardless of what is // passed in via the ProgramShape. Set the layouts to the appropriate values. 
ComputationLayout* entry_layout = - module_config.mutable_entry_computation_layout(); + module_config.mutable_host_entry_computation_layout(); for (int64 i = 0; i < entry_layout->parameter_count(); ++i) { TF_RETURN_IF_ERROR( entry_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( @@ -314,6 +346,8 @@ StatusOr HloModule::CreateModuleConfigFromProto( } TF_RETURN_IF_ERROR(entry_layout->mutable_result_layout()->CopyLayoutFromShape( program_shape.result())); + *module_config.mutable_device_entry_computation_layout() = + module_config.host_entry_computation_layout(); return module_config; } @@ -462,7 +496,18 @@ std::list HloModule::MakeComputationPostOrder() const { added_computations.insert(computation.get()); } } - CHECK_EQ(post_order.size(), computations_.size()); + if (post_order.size() != computations_.size()) { + for (HloComputation* computation : post_order) { + LOG(ERROR) << "Post Order: " << computation->name() << " (" + << computation->parent()->name() << ")"; + } + for (auto& computation : computations_) { + LOG(ERROR) << "Computations: " << computation->name() << " (" + << computation->parent()->name() << ")"; + } + LOG(FATAL) << "Mismatch computation count: post_order=" << post_order.size() + << " computation_count=" << computations_.size(); + } return post_order; } @@ -479,59 +524,29 @@ std::vector HloModule::MakeNonfusionComputations() const { std::unique_ptr HloModule::Clone(const string& suffix) const { VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n"; - auto module = MakeUnique(name_ + "-" + suffix); - module->config_ = config_; + auto module = MakeUnique(name_ + "-" + suffix, config_); module->entry_computation_handle_ = entry_computation_handle_; module->has_entry_computation_handle_ = has_entry_computation_handle_; - std::unordered_map clone_map; - for (auto& computation : computations_) { - if (computation->IsFusionComputation()) { - // Cloning of a fused computation is handled by its fusion instruction. - continue; - } - - // When cloning a computation, pass in the new module, so that for any - // fusion instruction in this computation, the fused computation will be - // deep cloned to the new module. - auto cloned_computation = computation->Clone(suffix, module.get()); - InsertOrDie(&clone_map, computation.get(), cloned_computation.get()); - - if (entry_computation_ == computation.get()) { - module->AddEntryComputation(std::move(cloned_computation)); - } else { - module->AddEmbeddedComputation(std::move(cloned_computation)); - } - } - - for (auto& cloned_computation : module->computations_) { - for (auto* instruction : cloned_computation->instructions()) { - // Rewrite instruction's called_computation to point to the cloned - // computations. - instruction->ReplaceCalledComputations([&](HloComputation* hlo) { - if (hlo->IsFusionComputation()) { - // Cloning of a fused computation has already been handled when its - // fusion instruction is cloned. So this hlo computation is already - // the cloned one. 
- return hlo; - } - return FindOrDie(clone_map, hlo); - }); - } - } + HloCloneContext context(module.get(), suffix); + auto cloned_computation = entry_computation_->Clone(suffix, &context); + module->AddEntryComputation(std::move(cloned_computation)); return module; } -HloComputation* HloModule::DeepCloneComputation(HloComputation* computation) { - HloComputation* clone = AddEmbeddedComputation(computation->Clone("", this)); - TF_CHECK_OK( - clone->root_instruction()->Accept([this](HloInstruction* instruction) { - instruction->ReplaceCalledComputations([this](HloComputation* callee) { - return DeepCloneComputation(callee); - }); - return Status::OK(); - })); - return clone; +HloComputation* HloModule::DeepCloneComputation(HloComputation* computation, + HloCloneContext* context) { + HloComputation* new_computation; + if (context != nullptr) { + if ((new_computation = context->FindComputation(computation)) != nullptr) { + return new_computation; + } + new_computation = + AddEmbeddedComputation(computation->Clone(context->suffix(), context)); + } else { + new_computation = AddEmbeddedComputation(computation->Clone("")); + } + return new_computation; } uint64 HloModule::RandomNew64() const { @@ -539,6 +554,14 @@ uint64 HloModule::RandomNew64() const { return rng_(); } +HloComputation* HloModule::GetComputationWithName( + tensorflow::StringPiece name) { + auto it = c_find_if(computations(), [&](HloComputation* computation) { + return computation->name() == name; + }); + return it == computations().end() ? nullptr : *it; +} + /* static */ std::atomic HloModule::next_unique_module_id_(0); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h index 9f7f25202ba42b..c93c74d34a95cf 100644 --- a/tensorflow/compiler/xla/service/hlo_module.h +++ b/tensorflow/compiler/xla/service/hlo_module.h @@ -26,12 +26,14 @@ limitations under the License. #include "tensorflow/compiler/xla/iterator_util.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" +#include "tensorflow/compiler/xla/service/hlo_clone_context.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/name_uniquer.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/lib/gtl/iterator_range.h" #include "tensorflow/core/platform/logging.h" @@ -41,10 +43,18 @@ namespace xla { // Describes a compilation unit at the HLO level. // -// A HLO module contains one or more HLO computations. The module contains one -// "entry" computation which produces the result. The module also includes any -// embedded computations used by instructions such as "map" and "reduce". All -// computations are owned by the module. +// HloModule is the top-level unit in the HLO IR. It corresponds to a whole +// "program". Running a module, from beginning to end, is the only way to run +// an XLA program. +// +// A module contains one "entry computation"; this HloComputation is like main() +// in a C program. The result of running the module is the result of running +// this computation. +// +// A module also contains some number of "nested computations". Each nested +// computation is attached to an HloInstruction within some other computation. 
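`Clone()` now routes everything through an `HloCloneContext`, and `DeepCloneComputation` consults the context first so a computation reachable from several callers is cloned only once. The sketch below shows that memoized-clone pattern with toy `Node`/`CloneContext` types; it illustrates the idea rather than the actual XLA API:

```c++
#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

// Toy "computation" that may call other computations.
struct Node {
  std::string name;
  std::vector<Node*> callees;
};

// Hypothetical stand-in for HloCloneContext: remembers old -> new mappings.
class CloneContext {
 public:
  Node* Find(const Node* original) const {
    auto it = map_.find(original);
    return it == map_.end() ? nullptr : it->second;
  }
  void Map(const Node* original, Node* clone) { map_[original] = clone; }

 private:
  std::unordered_map<const Node*, Node*> map_;
};

// Deep-clones `node`, reusing clones already recorded in `context`.
Node* DeepClone(const Node* node, CloneContext* context,
                std::vector<std::unique_ptr<Node>>* storage) {
  if (Node* existing = context->Find(node)) {
    return existing;  // Already cloned via some other caller.
  }
  storage->push_back(std::make_unique<Node>());
  Node* clone = storage->back().get();
  clone->name = node->name + ".clone";
  context->Map(node, clone);
  for (const Node* callee : node->callees) {
    clone->callees.push_back(DeepClone(callee, context, storage));
  }
  return clone;
}

int main() {
  Node shared{"shared", {}};
  Node a{"a", {&shared}};
  Node b{"b", {&shared}};
  Node entry{"entry", {&a, &b}};

  std::vector<std::unique_ptr<Node>> storage;
  CloneContext context;
  Node* entry_clone = DeepClone(&entry, &context, &storage);

  // `shared` is cloned exactly once even though it is reachable twice.
  std::cout << (entry_clone->callees[0]->callees[0] ==
                entry_clone->callees[1]->callees[0])
            << "\n";
}
```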
+// The meaning of the nested computation depends on the instruction it's +// attached to. class HloModule { public: HloModule(const string& name, @@ -55,7 +65,6 @@ class HloModule { // only be used for HloModules used outside of the XLA service (eg // tests). The versioned handle is used by the service in the compilation // cache. A default configuration is created for this module. - explicit HloModule(const string& name); explicit HloModule(const string& name, const HloModuleConfig& config); // Adds an entry computation to the module. A module can only have one entry @@ -86,8 +95,10 @@ class HloModule { std::unique_ptr Clone(const string& suffix = "clone") const; // Performs a deep clone of the computation, by recursively cloning all - // the called computations as well. - HloComputation* DeepCloneComputation(HloComputation* computation); + // the called computations as well. If the clone context is specified, it + // will be populated with the cloned object mappings. + HloComputation* DeepCloneComputation(HloComputation* computation, + HloCloneContext* context = nullptr); // Return a pointer to the entry computation of the module.. const HloComputation* entry_computation() const { @@ -99,12 +110,20 @@ class HloModule { return entry_computation_; } - ComputationLayout* mutable_entry_computation_layout() { - return config_.mutable_entry_computation_layout(); + ComputationLayout* mutable_host_entry_computation_layout() { + return config_.mutable_host_entry_computation_layout(); + } + + const ComputationLayout& host_entry_computation_layout() const { + return config_.host_entry_computation_layout(); + } + + ComputationLayout* mutable_device_entry_computation_layout() { + return config_.mutable_device_entry_computation_layout(); } - const ComputationLayout& entry_computation_layout() const { - return config_.entry_computation_layout(); + const ComputationLayout& device_entry_computation_layout() const { + return config_.device_entry_computation_layout(); } const VersionedComputationHandle& entry_computation_handle() const { @@ -131,6 +150,10 @@ class HloModule { MakeUnwrappingIterator(computations_.end())}; } + // Returns the computation in this module that has the name `name`. Returns + // null if there is no such computation. + HloComputation* GetComputationWithName(tensorflow::StringPiece name); + // Gets the number of computations in this module. int64 computation_count() const { return computations_.size(); } @@ -205,6 +228,25 @@ class HloModule { // the lifetime of this process. int unique_id() const { return unique_id_; } + // Returns a non-const version of the passed-in const HloInstruction*. This is + // safe on the argument that if you have a non-const module, then you can + // access all instructions in the module as non-const. + // + // Returns an error if the passed-in instruction is not from this module, + // except that it is allowed to pass in a null pointer. + // + // TODO(b/78350259): Eliminate const laundering. The argument above is not + // reliable since at any time someone could add or discover a way for a + // non-const module to transitively contain a const HloInstruction. The + // reliable way to do this would be to create a const laundering map from a + // module, mapping each encountered HloInstruction to its non-const version + // and then look up each instruction in need of laundering in that map, but + // this is much more expensive and complicated. 
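The comment above is the whole argument behind `LaunderConstInstructionFromModule`: given mutable access to the owning module, an instruction the module owns may be handed back as non-const. A small self-contained illustration of that ownership-checked `const_cast` pattern, with hypothetical `Module`/`Instruction` types standing in for the real classes:

```c++
#include <cassert>
#include <iostream>
#include <memory>
#include <vector>

struct Instruction {
  int id;
};

class Module {
 public:
  const Instruction* Add(int id) {
    instructions_.push_back(std::make_unique<Instruction>(Instruction{id}));
    return instructions_.back().get();
  }

  // Ownership-checked "const laundering": only instructions owned by this
  // module may be converted back to non-const.
  Instruction* Launder(const Instruction* instruction) {
    assert(Owns(instruction) && "instruction is not from this module");
    return const_cast<Instruction*>(instruction);
  }

 private:
  bool Owns(const Instruction* instruction) const {
    for (const auto& owned : instructions_) {
      if (owned.get() == instruction) return true;
    }
    return false;
  }

  std::vector<std::unique_ptr<Instruction>> instructions_;
};

int main() {
  Module module;
  const Instruction* read_only = module.Add(42);

  // Given mutable access to the owning module, recover a mutable pointer.
  Instruction* mutable_view = module.Launder(read_only);
  mutable_view->id = 43;
  std::cout << read_only->id << "\n";  // 43
}
```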
This returns a Status instead + // of doing a CHECK-failure in part to make it strongly apparent that this is + // something that can fail. + StatusOr LaunderConstInstructionFromModule( + const HloInstruction* hlo); + private: HloComputation* AddComputationInternal( std::unique_ptr computation, bool is_entry, diff --git a/tensorflow/compiler/xla/service/hlo_module_config.cc b/tensorflow/compiler/xla/service/hlo_module_config.cc index 4205b0402cb8b2..dae5578a3158fe 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.cc +++ b/tensorflow/compiler/xla/service/hlo_module_config.cc @@ -31,11 +31,13 @@ using tensorflow::strings::StrAppend; HloModuleConfig::HloModuleConfig() {} HloModuleConfig::HloModuleConfig(const ProgramShape& program_shape) - : entry_computation_layout_(program_shape) {} + : host_entry_computation_layout_(program_shape), + device_entry_computation_layout_(program_shape) {} void HloModuleConfig::SetDefaultComputationLayout( const ProgramShape& program_shape) { - entry_computation_layout_ = ComputationLayout(program_shape); + host_entry_computation_layout_ = ComputationLayout(program_shape); + device_entry_computation_layout_ = ComputationLayout(program_shape); } string HloModuleConfig::compilation_cache_key() const { @@ -44,11 +46,18 @@ string HloModuleConfig::compilation_cache_key() const { StrAppend(&key, "::("); std::vector params; for (const ShapeLayout& param_layout : - entry_computation_layout_->parameter_layouts()) { + host_entry_computation_layout_->parameter_layouts()) { params.push_back(param_layout.shape().DebugString()); } StrAppend(&key, tensorflow::str_util::Join(params, ", "), ") => ", - entry_computation_layout_->result_shape().SerializeAsString()); + host_entry_computation_layout_->result_shape().SerializeAsString()); + for (const ShapeLayout& param_layout : + device_entry_computation_layout_->parameter_layouts()) { + params.push_back(param_layout.shape().DebugString()); + } + StrAppend( + &key, tensorflow::str_util::Join(params, ", "), ") => ", + device_entry_computation_layout_->result_shape().SerializeAsString()); if (seed() != 0) { // TODO(b/32083678): force recompilation to reset global state. static std::atomic counter{0}; diff --git a/tensorflow/compiler/xla/service/hlo_module_config.h b/tensorflow/compiler/xla/service/hlo_module_config.h index 586a03d412681c..cdb0b29a2399b3 100644 --- a/tensorflow/compiler/xla/service/hlo_module_config.h +++ b/tensorflow/compiler/xla/service/hlo_module_config.h @@ -41,26 +41,44 @@ class HloModuleConfig { explicit HloModuleConfig(const ProgramShape& program_shape); // Checks if this config has an entry computation layout already. - bool has_entry_computation_layout() const { - return entry_computation_layout_.has_value(); + bool has_host_entry_computation_layout() const { + return host_entry_computation_layout_.has_value(); + } + + bool has_device_entry_computation_layout() const { + return device_entry_computation_layout_.has_value(); } // Sets the entry computation layout for this config. If the entry computation // layout already exists, it is silently replaced. void SetDefaultComputationLayout(const ProgramShape& program_shape); - // Returns a constant reference to the layout of the entry computation. + // Returns a constant reference to the on-host layout of the entry + // computation. Assumes the layout was set. 
+ const ComputationLayout& host_entry_computation_layout() const { + CHECK(host_entry_computation_layout_.has_value()); + return *host_entry_computation_layout_; + } + + // Returns a mutable pointer to the layout of the on-host entry computation. // Assumes the layout was set. - const ComputationLayout& entry_computation_layout() const { - CHECK(entry_computation_layout_.has_value()); - return *entry_computation_layout_; + ComputationLayout* mutable_host_entry_computation_layout() { + CHECK(host_entry_computation_layout_.has_value()); + return &(*host_entry_computation_layout_); } - // Returns a mutable pointer to the layout of the entry computation. Assumes - // the layout was set. - ComputationLayout* mutable_entry_computation_layout() { - CHECK(entry_computation_layout_.has_value()); - return &(*entry_computation_layout_); + // Returns a constant reference to the on-device layout of the entry + // computation. Assumes the layout was set. + const ComputationLayout& device_entry_computation_layout() const { + CHECK(device_entry_computation_layout_.has_value()); + return *device_entry_computation_layout_; + } + + // Returns a mutable pointer to the layout of the on-device entry computation. + // Assumes the layout was set. + ComputationLayout* mutable_device_entry_computation_layout() { + CHECK(device_entry_computation_layout_.has_value()); + return &(*device_entry_computation_layout_); } // Returns whether to enable HLO-level profiling. @@ -109,7 +127,8 @@ class HloModuleConfig { private: // If you add new members, be sure to update compilation_cache_key. - tensorflow::gtl::optional entry_computation_layout_; + tensorflow::gtl::optional host_entry_computation_layout_; + tensorflow::gtl::optional device_entry_computation_layout_; // Whether this is a 'host module'. bool is_host_module_ = false; diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.cc b/tensorflow/compiler/xla/service/hlo_module_dce.cc new file mode 100644 index 00000000000000..98d20315e399c6 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_module_dce.cc @@ -0,0 +1,131 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_module_dce.h" + +#include +#include + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_liveness_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/status.h" +#include "tensorflow/compiler/xla/status_macros.h" +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/platform/logging.h" + +namespace xla { + +namespace { + +bool HasSendRecv(HloComputation* computation) { + for (auto* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kSend || + instruction->opcode() == HloOpcode::kSendDone || + instruction->opcode() == HloOpcode::kRecv || + instruction->opcode() == HloOpcode::kRecvDone) { + return true; + } + for (auto* sub_computation : instruction->called_computations()) { + if (HasSendRecv(sub_computation)) { + return true; + } + } + } + return false; +} + +StatusOr RunWhileDCE(HloModule* module, HloLivenessAnalysis* liveness) { + bool changed = false; + for (auto* computation : module->computations()) { + for (auto* instruction : computation->instructions()) { + if (instruction->opcode() != HloOpcode::kWhile) { + continue; + } + + const auto* xla_while = instruction; + auto* while_body_comp = xla_while->while_body(); + auto* while_body_param = while_body_comp->parameter_instruction(0); + auto* while_body_root = while_body_comp->root_instruction(); + + if (!ShapeUtil::IsTuple(xla_while->shape()) || + while_body_root->opcode() != HloOpcode::kTuple || + HasSendRecv(while_body_comp)) { + // Only run DCE on tuple-shaped while loops where body root is Tuple, + // with no send/recv instructions. + VLOG(1) << "WhileDCE SKIP while: " << xla_while->ToString(); + continue; + } + + // Remove dead tuple elements. + const int64 tuple_element_count = + ShapeUtil::TupleElementCount(xla_while->shape()); + for (int64 i = 0; i < tuple_element_count; ++i) { + if (liveness->IsLive(xla_while, {i})) { + continue; + } + VLOG(1) << "WhileDCE Dead while tuple element." + << " while: " << xla_while->name() << " tuple_index: " << i; + // Transform while.body computation to make tuple element at + // 'shape_index' as simple pass-through parameter (which candidate + // be removed later by simplification pass). + HloInstruction* pass_thru_gte = while_body_comp->AddInstruction( + HloInstruction::CreateGetTupleElement( + while_body_param->shape().tuple_shapes(i), while_body_param, + i)); + // Replace while.body.root Tuple operand at 'tuple_index' with + // 'pass_thru_gte', making prior operand a dead root (to be cleaned + // up with a subsequent DCE pass). 
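Note that `RunWhileDCE` never deletes the dead element itself: it only rewires the body root to forward the parameter's value at that index, leaving the old producer dead so the follow-up `HloDCE` run can remove it. A toy value-type model of that rewiring (plain strings standing in for HLO instructions, not the real IR) might look like:

```c++
#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for a while body: each root element is either computed from
// the parameter or passed straight through.
struct WhileBody {
  // root_exprs[i] describes what the body returns for tuple element i.
  std::vector<std::string> root_exprs;
};

// Mirrors the RunWhileDCE transformation above: a dead element's root operand
// is replaced by a get-tuple-element of the parameter at the same index.
void MakePassThrough(WhileBody* body, int i) {
  body->root_exprs[i] = "get-tuple-element(param), index=" + std::to_string(i);
}

int main() {
  WhileBody body;
  body.root_exprs = {"add(get-tuple-element(param, 0), 1)",
                     "multiply(get-tuple-element(param, 1), "
                     "get-tuple-element(param, 1))"};

  // Liveness analysis (not modeled here) says element 1 is dead.
  MakePassThrough(&body, 1);

  for (int i = 0; i < static_cast<int>(body.root_exprs.size()); ++i) {
    std::cout << "root[" << i << "] = " << body.root_exprs[i] << "\n";
  }
  // The multiply is now unused and a later HloDCE run can delete it.
}
```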
+ TF_RETURN_IF_ERROR( + while_body_root->ReplaceOperandWith(i, pass_thru_gte)); + changed = true; + } + } + } + return changed; +} + +} // namespace + +StatusOr HloModuleDCE::Run(HloModule* module) { + VLOG(2) << "Before HloModuleDCE:"; + XLA_VLOG_LINES(3, module->ToString()); + + std::unique_ptr liveness; + TF_ASSIGN_OR_RETURN(liveness, HloLivenessAnalysis::Run(*module)); + + // Sweep through while instructions, transforming dead while tuple element + // computations to pass through tuple values (creating dead roots in while + // body computation in the process). + TF_ASSIGN_OR_RETURN(bool hlo_module_dce_changed, + RunWhileDCE(module, liveness.get())); + + // Run HloDCE to clean up any dead code created during HloModuleDCE. + HloDCE hlo_dce; + TF_ASSIGN_OR_RETURN(bool hlo_dce_changed, hlo_dce.Run(module)); + + VLOG(2) << "After HloModuleDCE:"; + XLA_VLOG_LINES(3, module->ToString()); + + return hlo_module_dce_changed | hlo_dce_changed; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_dce.h b/tensorflow/compiler/xla/service/hlo_module_dce.h new file mode 100644 index 00000000000000..29024085c10389 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_module_dce.h @@ -0,0 +1,43 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_DCE_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_DCE_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// HLO pass which removes dead code from computations in the module using +// HloModule-scoped analysis (HloLivenessAnalysis). +// +// Sweeps through live instructions which cross computation boundaries (kWhile), +// and removes code at dead shape indices. +// +class HloModuleDCE : public HloPassInterface { + public: + ~HloModuleDCE() override {} + tensorflow::StringPiece name() const override { return "hlo-module-dce"; } + + // Run the pass on the given module. Returns whether the module was changed + // (instructions were removed). + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MODULE_DCE_H_ diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc new file mode 100644 index 00000000000000..363862e4905fc1 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc @@ -0,0 +1,371 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_module_dce.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/literal_test_util.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/types.h" + +namespace xla { +namespace { + +class HloModuleDceTest : public HloTestBase { + protected: + HloModuleDceTest() {} + + // Returns whether the given instruction exists in the given computation. + bool HasInstruction(const HloComputation& computation, + const HloInstruction* instruction) { + return std::find(computation.instructions().begin(), + computation.instructions().end(), + instruction) != computation.instructions().end(); + } + + // Returns whether the while instruction with name 'while_name' in + // 'computation' passes through its tuple element at 'tuple_index' from + // parameter to root instruction. + bool WhileBodyHasPassThroughTupleElement(const HloComputation* computation, + const string& while_name, + const int64 tuple_index) { + for (auto* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kWhile && + instruction->name() == while_name) { + auto* while_body_comp = instruction->while_body(); + auto* while_body_param = while_body_comp->parameter_instruction(0); + auto* while_body_root = while_body_comp->root_instruction(); + if (while_body_root->opcode() != HloOpcode::kTuple) { + return false; + } + auto* operand = while_body_root->operand(tuple_index); + if (operand->opcode() == HloOpcode::kGetTupleElement && + operand->tuple_index() == tuple_index && + operand->operand(0) == while_body_param) { + return true; + } + return false; + } + } + return false; + } +}; + +// Tests that a while with all outputs live is unmodified. 
+TEST_F(HloModuleDceTest, WhileWithLiveOutputs) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body { + loop_var.1 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1 + multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply) + } + SimpleLoop.condition { + loop_var.2 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + constant.2 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2) + } + ENTRY SimpleLoop { + constant.3 = s32[] constant(0) + constant.4 = s32[3]{0} constant({0, 1, 2}) + tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4) + ROOT while = (s32[], s32[3]{0}) while(tuple.1), condition= + SimpleLoop.condition, body=SimpleLoop.body + })") + .ValueOrDie(); + + HloModuleDCE dce; + EXPECT_FALSE(dce.Run(module.get()).ValueOrDie()); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 0)); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 1)); +} + +// Tests a while loop with one unused output (which is used in the while loop +// body by an instruction with side-effects: rng) is unmodified. +TEST_F(HloModuleDceTest, WhileWithUnusedSideEffectingTupleElement) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body { + loop_var.1 = (s32[], f32[]) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = f32[] get-tuple-element(loop_var.1), index=1 + constant.2 = f32[] constant(1.0) + rng = f32[] rng(constant.2, get-tuple-element.2), distribution=rng_uniform + add.1 = s32[] add(get-tuple-element.2, constant.2) + ROOT tuple = (s32[], f32[]) tuple(add, add.1) + } + SimpleLoop.condition { + loop_var.2 = (s32[], f32[]) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + constant.3 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.3, constant.3) + } + ENTRY SimpleLoop { + constant.4 = s32[] constant(0) + constant.5 = f32[] constant(0.0) + tuple.1 = (s32[], f32[]) tuple(constant.4, constant.5) + while = (s32[], f32[]) while(tuple.1), condition= + SimpleLoop.condition, body=SimpleLoop.body + ROOT get-tuple-element.4 = s32[] get-tuple-element(while), index=0 + })") + .ValueOrDie(); + + HloModuleDCE dce; + EXPECT_FALSE(dce.Run(module.get()).ValueOrDie()); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 0)); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 1)); +} + +// Tests that a while loop with one dead tuple element at {1} has its while +// loop body modified to make that tuple element pass-through the while body. 
+TEST_F(HloModuleDceTest, OneWhileWithDeadTupleElement) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body { + loop_var.1 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1 + multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply) + } + SimpleLoop.condition { + loop_var.2 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + constant.2 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2) + } + ENTRY SimpleLoop { + constant.3 = s32[] constant(0) + constant.4 = s32[3]{0} constant({0, 1, 2}) + tuple.1 = (s32[], s32[3]{0}) tuple(constant.3, constant.4) + while = (s32[], s32[3]{0}) while(tuple.1), condition= + SimpleLoop.condition, body=SimpleLoop.body + ROOT get-tuple-element.4 = s32[] get-tuple-element(while), index=0 + })") + .ValueOrDie(); + + HloModuleDCE dce; + // While tuple element {1} should not be pass-through before ModuleDCE. + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 1)); + EXPECT_TRUE(dce.Run(module.get()).ValueOrDie()); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 0)); + // While tuple element {1} should now be pass-through after ModuleDCE. + EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 1)); +} + +// Tests that a tuple element {1} used by condition computation (which appears +// dead in while.body{1} and at while.result{1}) propgates liveness of this +// tuple element to while.body{1} and at while.result{1}. +TEST_F(HloModuleDceTest, OneWhileWithTupleElementUsedByCond) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body { + loop_var.1 = (s32[], s32[]) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[] get-tuple-element(loop_var.1), index=1 + multiply = s32[] multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple = (s32[], s32[]) tuple(add, multiply) + } + SimpleLoop.condition { + loop_var.2 = (s32[], s32[]) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=1 + constant.2 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2) + } + ENTRY SimpleLoop { + constant.3 = s32[] constant(0) + constant.4 = s32[] constant(0) + tuple.1 = (s32[], s32[]) tuple(constant.3, constant.4) + while = (s32[], s32[]) while(tuple.1), condition= + SimpleLoop.condition, body=SimpleLoop.body + ROOT get-tuple-element.4 = s32[] get-tuple-element(while), index=0 + })") + .ValueOrDie(); + + HloModuleDCE dce; + // While tuple element {1} should not be pass-through before ModuleDCE. + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 1)); + EXPECT_FALSE(dce.Run(module.get()).ValueOrDie()); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 0)); + // While tuple element {1} still be pass-through after ModuleDCE. 
+ EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while", 1)); +} + +// Tests that HloModuleDCE can remove a dead tuple element at index {1} between +// two dependent while loops. +TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body0 { + loop_var.1 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=0 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=1 + multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple = (s32[], s32[3]{0}) tuple(add, multiply) + } + SimpleLoop.condition0 { + loop_var.2 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=0 + constant.2 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2) + } + SimpleLoop.body1 { + loop_var.3 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.4 = s32[] get-tuple-element(loop_var.3), index=0 + constant.3 = s32[] constant(1) + add.1 = s32[] add(get-tuple-element.4, constant.3) + get-tuple-element.5 = s32[3]{0} get-tuple-element(loop_var.3), index=1 + multiply.1 = s32[3]{0} multiply(get-tuple-element.5, get-tuple-element.5) + ROOT tuple.1 = (s32[], s32[3]{0}) tuple(add.1, multiply.1) + } + SimpleLoop.condition1 { + loop_var.4 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.6 = s32[] get-tuple-element(loop_var.4), index=0 + constant.4 = s32[] constant(5) + ROOT less-than.1 = pred[] less-than(get-tuple-element.6, constant.4) + } + ENTRY SimpleLoop { + constant.5 = s32[] constant(0) + constant.6 = s32[3]{0} constant({0, 1, 2}) + tuple.2 = (s32[], s32[3]{0}) tuple(constant.5, constant.6) + while.1 = (s32[], s32[3]{0}) while(tuple.2), condition= + SimpleLoop.condition0, body=SimpleLoop.body0 + get-tuple-element.7 = s32[] get-tuple-element(while.1), index=0 + tuple.3 = (s32[], s32[3]{0}) tuple(get-tuple-element.7, constant.6) + while.2 = (s32[], s32[3]{0}) while(tuple.3), condition= + SimpleLoop.condition1, body=SimpleLoop.body1 + ROOT get-tuple-element.8 = s32[] get-tuple-element(while.2), index=0 + })") + .ValueOrDie(); + + HloModuleDCE dce; + // Before HloModuleDCE while.1 and while.2 should not have pass-thru elements. + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.1", 1)); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.2", 1)); + EXPECT_TRUE(dce.Run(module.get()).ValueOrDie()); + // After HloModuleDCE while.1 and while.2 should have pass-thru elements, + // after being modified to pass through unused tuple element {1}. + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.1", 0)); + EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.1", 1)); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.2", 0)); + EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.2", 1)); +} + +// Tests that HloModuleDCE can remove a dead tuple element at while.1{0} and +// while.2{1}, between two dependent while loops. 
+TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElementSwizzled) { + auto module = ParseHloString(R"( + HloModule SimpleLoop + SimpleLoop.body0 { + loop_var.1 = (s32[3]{0}, s32[]) parameter(0) + get-tuple-element.1 = s32[] get-tuple-element(loop_var.1), index=1 + constant.1 = s32[] constant(1) + add = s32[] add(get-tuple-element.1, constant.1) + get-tuple-element.2 = s32[3]{0} get-tuple-element(loop_var.1), index=0 + multiply = s32[3]{0} multiply(get-tuple-element.2, get-tuple-element.2) + ROOT tuple = (s32[3]{0}, s32[]) tuple(multiply, add) + } + SimpleLoop.condition0 { + loop_var.2 = (s32[3]{0}, s32[]) parameter(0) + get-tuple-element.3 = s32[] get-tuple-element(loop_var.2), index=1 + constant.2 = s32[] constant(5) + ROOT less-than = pred[] less-than(get-tuple-element.3, constant.2) + } + SimpleLoop.body1 { + loop_var.3 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.4 = s32[] get-tuple-element(loop_var.3), index=0 + constant.3 = s32[] constant(1) + add.1 = s32[] add(get-tuple-element.4, constant.3) + get-tuple-element.5 = s32[3]{0} get-tuple-element(loop_var.3), index=1 + multiply.1 = s32[3]{0} multiply(get-tuple-element.5, get-tuple-element.5) + ROOT tuple.1 = (s32[], s32[3]{0}) tuple(add.1, multiply.1) + } + SimpleLoop.condition1 { + loop_var.4 = (s32[], s32[3]{0}) parameter(0) + get-tuple-element.6 = s32[] get-tuple-element(loop_var.4), index=0 + constant.4 = s32[] constant(5) + ROOT less-than.1 = pred[] less-than(get-tuple-element.6, constant.4) + } + ENTRY SimpleLoop { + constant.5 = s32[] constant(0) + constant.6 = s32[3]{0} constant({0, 1, 2}) + tuple.2 = (s32[3]{0}, s32[]) tuple(constant.6, constant.5) + while.1 = (s32[3]{0}, s32[]) while(tuple.2), condition= + SimpleLoop.condition0, body=SimpleLoop.body0 + get-tuple-element.7 = s32[] get-tuple-element(while.1), index=1 + tuple.3 = (s32[], s32[3]{0}) tuple(get-tuple-element.7, constant.6) + while.2 = (s32[], s32[3]{0}) while(tuple.3), condition= + SimpleLoop.condition1, body=SimpleLoop.body1 + ROOT get-tuple-element.8 = s32[] get-tuple-element(while.2), index=0 + })") + .ValueOrDie(); + + HloModuleDCE dce; + // Before HloModuleDCE while.1{0} and while.2{1} should not be pass-thru. + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.1", 0)); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.2", 1)); + EXPECT_TRUE(dce.Run(module.get()).ValueOrDie()); + // After HloModuleDCE while.1{0} and while.2{1} not be pass-thru elements. + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.1", 1)); + EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.1", 0)); + EXPECT_FALSE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.2", 0)); + EXPECT_TRUE(WhileBodyHasPassThroughTupleElement(module->entry_computation(), + "while.2", 1)); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc index 54c34ce1166516..4f1715e4cafd1a 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc +++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/hlo_module_group_metadata.h" +#include #include #include @@ -47,13 +48,16 @@ string HloModuleGroupMetadata::TrackedInstruction::ToString() const { case ComputationKind::kConditionalFalse: repr += ":CONDITIONAL_FALSE"; break; + case ComputationKind::kCallFunction: + repr += ":CALL"; + break; } return repr; } /* static */ StatusOr> HloModuleGroupMetadata::Build(const std::vector& modules) { - auto metadata = absl::make_unique(modules); + auto metadata = MakeUnique(modules); TF_RETURN_IF_ERROR(metadata->Build()); return std::move(metadata); } @@ -83,6 +87,7 @@ Status HloModuleGroupMetadata::Build() { << "Peer instruction does not match the computation kind"; TF_RETURN_IF_ERROR( AddCompanion(tracked->instruction(), peer_tracked->instruction())); + tracked_instructions_comms_[tracked->instruction()].push_back(hlo); } // Add the parents of companion instructions (they must be all of the same @@ -107,6 +112,42 @@ Status HloModuleGroupMetadata::Build() { TF_RETURN_IF_ERROR(computation->Accept(visitor)); } } + TF_RETURN_IF_ERROR(VerifyCompanionSets()); + if (VLOG_IS_ON(4)) { + DumpCollectedStats(); + } + return Status::OK(); +} + +Status HloModuleGroupMetadata::VerifyCompanionSets() const { + for (const auto& companions : companion_sets_) { + // A companion set must be composed at most of an instruction per + // device/module. + std::unordered_set devices; + for (HloInstruction* instruction : *companions) { + // Go through all the communicating instructions (send, recv) of the given + // companion, and record their device. + std::unordered_set comm_devices; + for (HloInstruction* comm_instruction : + tracked_instructions_comms_.at(instruction)) { + auto device = GetInstructionDevice(*comm_instruction); + TF_RET_CHECK(device) << "Instruction " << comm_instruction->ToString() + << " does not have a device"; + comm_devices.insert(*device); + } + for (int64 device : comm_devices) { + if (!devices.insert(device).second) { + std::stringstream ss; + ss << "Companion set:" << std::endl; + for (HloInstruction* hlo : *companions) { + ss << " " << hlo->name() << std::endl; + } + ss << "has multiple instructions on the same device"; + return FailedPrecondition("%s", ss.str().c_str()); + } + } + } + } return Status::OK(); } @@ -194,6 +235,28 @@ int64 HloModuleGroupMetadata::GetModuleId(const HloModule* module) const { LOG(FATAL) << "unknown module"; } +tensorflow::gtl::optional HloModuleGroupMetadata::GetInstructionDevice( + const HloInstruction& instruction) const { + // The module group metadata can be created in both "single module, multiple + // devices" and "multiple modules, no explicit devices" fashions. + // The API returns an optional even though the current implementation always + // returns a device, to account for cases where we cannot guess a device. + // In such cases the VerifyChannelInstructions() will return proper errors. 
+ tensorflow::gtl::optional device = + instruction.sharding_unique_device(); + if (!device) { + device = GetModuleId(instruction.parent()->parent()); + } + return device; +} + +int64 HloModuleGroupMetadata::GetDeviceModulesCount() const { + return std::count_if(modules_.begin(), modules_.end(), + [](const HloModule* module) { + return !module->config().is_host_module(); + }); +} + Status HloModuleGroupMetadata::RecordInstructions() { const auto visitor = [this](HloInstruction* hlo) -> Status { if (hlo->opcode() == HloOpcode::kWhile) { @@ -206,6 +269,9 @@ Status HloModuleGroupMetadata::RecordInstructions() { TrackedInstruction(hlo, ComputationKind::kConditionalTrue); tracked_instructions_[hlo->false_computation()] = TrackedInstruction(hlo, ComputationKind::kConditionalFalse); + } else if (hlo->opcode() == HloOpcode::kCall) { + tracked_instructions_[hlo->to_apply()] = + TrackedInstruction(hlo, ComputationKind::kCallFunction); } if (!IsChannelInstruction(hlo)) { return Status::OK(); @@ -252,20 +318,22 @@ Status HloModuleGroupMetadata::RecordInstructions() { TF_RETURN_IF_ERROR(computation->Accept(visitor)); } } + VLOG(2) << "Created " << channels_.size() << " channels"; return Status::OK(); } Status HloModuleGroupMetadata::AddCompanion(HloInstruction* instruction1, HloInstruction* instruction2) { TF_RET_CHECK(instruction1->opcode() == HloOpcode::kWhile || - instruction1->opcode() == HloOpcode::kConditional); + instruction1->opcode() == HloOpcode::kConditional || + instruction1->opcode() == HloOpcode::kCall); VLOG(2) << "adding as companions:" << instruction1->ToString() << " and " << instruction2->ToString(); if (!ContainsKey(companion_set_index_, instruction1) && !ContainsKey(companion_set_index_, instruction2)) { companion_sets_.push_back( - absl::make_unique>()); + tensorflow::MakeUnique>()); auto companion_set = companion_sets_.back().get(); companion_set->insert(instruction1); companion_set->insert(instruction2); @@ -313,44 +381,46 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() { if (!ShapeUtil::Compatible(send_shape, recv_shape)) { return FailedPrecondition("send/recv shapes do not match"); } - const HloModule* send_module = channel.send->parent()->parent(); - const HloModule* send_done_module = channel.send_done->parent()->parent(); - if (send_module != send_done_module) { + auto send_device = GetInstructionDevice(*channel.send); + auto send_done_device = GetInstructionDevice(*channel.send_done); + if (!send_device) { + return FailedPrecondition("send instruction must have a device: %s", + channel.send->ToString().c_str()); + } + if (!send_done_device) { + return FailedPrecondition("send_done instruction must have a device: %s", + channel.send_done->ToString().c_str()); + } + if (*send_device != *send_done_device) { return FailedPrecondition( "send and send-done (channel=%lld) must be on the same device: %lld " "vs. 
%lld", - channel.id, GetModuleId(send_module), GetModuleId(send_done_module)); + channel.id, *send_device, *send_done_device); + } + auto recv_device = GetInstructionDevice(*channel.recv); + auto recv_done_device = GetInstructionDevice(*channel.recv_done); + if (!recv_done_device) { + return FailedPrecondition("recv_done instruction must have a device: %s", + channel.recv_done->ToString().c_str()); } - const HloModule* recv_module = channel.recv->parent()->parent(); - const HloModule* recv_done_module = channel.recv_done->parent()->parent(); - if (recv_module != recv_done_module) { + if (*recv_device != *recv_done_device) { return FailedPrecondition( "recv and recv-done (channel=%lld) must be on the same device: %lld " "vs. %lld", - channel.id, GetModuleId(recv_module), GetModuleId(recv_done_module)); + channel.id, *recv_device, *recv_done_device); } - if (send_module == recv_module) { + if (*send_device == *recv_device) { return FailedPrecondition( "send and recv (channel=%lld) must be on different devices: %lld", - channel.id, GetModuleId(send_module)); + channel.id, *send_device); } } - // Check if channel instructions are used only in allowed computations. - const auto allowed = [this](HloInstruction* hlo) { - HloComputation* computation = hlo->parent(); - const HloModule* module = computation->parent(); - if (module->entry_computation() == computation || - tracked_instructions_.count(computation) > 0) { - return true; - } - return false; - }; for (const Channel& channel : channels_) { - if (!allowed(channel.send) || !allowed(channel.send_done) || - !allowed(channel.recv) || !allowed(channel.recv_done)) { - return FailedPrecondition("channel is used in disallowed computation"); - } + TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.send)); + TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.send_done)); + TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.recv)); + TF_RETURN_IF_ERROR(CheckCommunicatingInstruction(channel.recv_done)); } // Check if the nest levels match for each channel. 
for (const Channel& channel : channels_) { @@ -368,4 +438,47 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() { return Status::OK(); } +Status HloModuleGroupMetadata::CheckCommunicatingInstruction( + HloInstruction* instruction) const { + HloComputation* computation = instruction->parent(); + const HloModule* module = computation->parent(); + if (module->entry_computation() == computation || + tracked_instructions_.count(computation) > 0) { + return Status::OK(); + } + return FailedPrecondition("channel is used in disallowed computation"); +} + +void HloModuleGroupMetadata::DumpCollectedStats() const { + std::map, int64> communication_histogram; + for (auto& channel : channels_) { + auto from_device = GetInstructionDevice(*channel.send); + auto to_device = GetInstructionDevice(*channel.recv); + LOG(INFO) << "Channel " << channel.id << ": from_device=" << *from_device + << " to_device=" << *to_device << " send=" << channel.send->name() + << " send_done=" << channel.send_done->name() + << " recv=" << channel.recv->name() + << " recv_done=" << channel.recv_done->name(); + communication_histogram[std::pair(*from_device, + *to_device)] += 1; + } + for (auto& fromto_count : communication_histogram) { + LOG(INFO) << "From " << fromto_count.first.first << " to " + << fromto_count.first.second << ": " << fromto_count.second; + } + for (auto& companion_set : companion_sets_) { + LOG(INFO) << "Companion set:"; + for (HloInstruction* instruction : *companion_set) { + LOG(INFO) << " " << instruction->name(); + } + } + for (auto& instruction_comm : tracked_instructions_comms_) { + LOG(INFO) << "Communicating instruction " << instruction_comm.first->name(); + for (HloInstruction* instruction : instruction_comm.second) { + auto device = GetInstructionDevice(*instruction); + LOG(INFO) << " " << instruction->name() << " on device " << *device; + } + } +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h index c48a7ab0b59269..ffde3a332dfc14 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h +++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h @@ -29,6 +29,7 @@ limitations under the License. #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/optional.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -60,6 +61,7 @@ class HloModuleGroupMetadata { kWhileBody, kConditionalTrue, kConditionalFalse, + kCallFunction, }; // Tracks the instruction mapped to a given computation, and the computation @@ -147,6 +149,15 @@ class HloModuleGroupMetadata { // the module in the module vector. int64 GetModuleId(const HloModule* module) const; + // Retrieves the device an instruction is assigned to. Either from the + // sharding information, or from the ordinal of the module the instruction + // is in. + tensorflow::gtl::optional GetInstructionDevice( + const HloInstruction& instruction) const; + + // Returns the number of modules for devices (excluding the host module). + int64 GetDeviceModulesCount() const; + // Returns the companion instructions for the given instruction. // // Precondition: IsCompanionWhile(instruction) is true. 
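`DumpCollectedStats` above folds the channels into a per `(from_device, to_device)` histogram before logging it. The counting pattern on its own, with plain pairs of device ids rather than the real `Channel` structs:

```c++
#include <cstdint>
#include <iostream>
#include <map>
#include <utility>
#include <vector>

int main() {
  // Each entry is a channel's (send device, recv device), as recovered by
  // GetInstructionDevice in the code above.
  std::vector<std::pair<int64_t, int64_t>> channels = {
      {0, 1}, {0, 1}, {1, 0}, {0, 2}};

  // Histogram of how many channels go from each device to each other device.
  std::map<std::pair<int64_t, int64_t>, int64_t> histogram;
  for (const auto& channel : channels) {
    histogram[channel] += 1;
  }

  for (const auto& fromto_count : histogram) {
    std::cout << "From " << fromto_count.first.first << " to "
              << fromto_count.first.second << ": " << fromto_count.second
              << "\n";
  }
}
```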
@@ -202,6 +213,15 @@ class HloModuleGroupMetadata { Status AddCompanion(HloInstruction* instruction1, HloInstruction* instruction2); + // Checks whether a communicating instruction is placed in a valid position + // within the graph. + Status CheckCommunicatingInstruction(HloInstruction* instruction) const; + + // Performs a consistency check on the companion sets built for the input + // modules. Check that a companion set does not include instructions from the + // same module/device. + Status VerifyCompanionSets() const; + // Retrieves a pointer to the stored TrackedInstruction associated with a // tracked computation, or nullptr in case such computation is not tracked. const TrackedInstruction* GetTrackedInstruction( @@ -210,6 +230,9 @@ class HloModuleGroupMetadata { return it != tracked_instructions_.end() ? &it->second : nullptr; } + // Dump all the collected module group statistics to the logs. + void DumpCollectedStats() const; + // List of all companion instructions sets in the module. std::vector>> companion_sets_; @@ -221,6 +244,11 @@ class HloModuleGroupMetadata { tensorflow::gtl::FlatMap tracked_instructions_; + // Maps tracked instructions (kWhile, kConditional, kCall, ...) to the set of + // communicating instructions within the proper called computation(s). + tensorflow::gtl::FlatMap> + tracked_instructions_comms_; + // All channels in the module. std::vector channels_; diff --git a/tensorflow/compiler/xla/service/hlo_module_group_util.cc b/tensorflow/compiler/xla/service/hlo_module_group_util.cc index 289c96b0a7b90c..5a0d1e264eb509 100644 --- a/tensorflow/compiler/xla/service/hlo_module_group_util.cc +++ b/tensorflow/compiler/xla/service/hlo_module_group_util.cc @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_reachability.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -289,7 +290,7 @@ HloModuleGroupUtil::ComputeReachability( TF_RETURN_IF_ERROR( VisitTopologicalOrder(&visit_states, visit_function, root)); } - auto reachability = absl::make_unique(post_order); + auto reachability = MakeUnique(post_order); for (HloInstruction* hlo : post_order) { reachability->SetReachabilityToUnion(GlobalPredecessors(hlo), hlo); } diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h index ca763076a16af1..1fe06ee0c0d142 100644 --- a/tensorflow/compiler/xla/service/hlo_opcode.h +++ b/tensorflow/compiler/xla/service/hlo_opcode.h @@ -69,11 +69,13 @@ namespace xla { V(kCrossReplicaSum, "cross-replica-sum") \ V(kCustomCall, "custom-call") \ V(kDivide, "divide") \ + V(kDomain, "domain") \ V(kDot, "dot") \ V(kDynamicSlice, "dynamic-slice") \ V(kDynamicUpdateSlice, "dynamic-update-slice") \ V(kEq, "equal-to", kHloOpcodeIsComparison) \ V(kExp, "exponential") \ + V(kExpm1, "exponential-minus-one") \ V(kFft, "fft") \ V(kFloor, "floor") \ V(kFusion, "fusion", kHloOpcodeIsVariadic) \ @@ -87,6 +89,7 @@ namespace xla { V(kIsFinite, "is-finite") \ V(kLe, "less-than-or-equal-to", kHloOpcodeIsComparison) \ V(kLog, "log") \ + V(kLog1p, "log-plus-one") \ V(kAnd, "and") \ V(kNot, "not") \ V(kOr, "or") \ diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc index e89d94bede6c43..dcd4725fe78e8b 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering.cc @@ -19,7 +19,6 @@ 
limitations under the License. #include #include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/liveness_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -170,10 +169,10 @@ bool HloOrdering::UseIsBeforeValueDefinition( // is before the def if the instruction allows buffer sharing (in place // computation). if (use.instruction == value.defining_instruction() && - CanShareOperandBufferWithUser( + dataflow.CanShareOperandBufferWithUser( use.instruction->mutable_operand(use.operand_number), use.operand_index, value.defining_instruction(), - value.defining_index(), dataflow)) { + value.defining_index())) { VLOG(4) << " use is value def, and instruction can share use buffer"; return true; } diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc index 37a7fbad97cea2..cfe5dace05ac03 100644 --- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc +++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc @@ -22,10 +22,10 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_scheduling.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -310,7 +310,7 @@ ENTRY while.v11 { })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseHloString(module_str)); DependencyHloOrdering ordering(module.get()); ordering.ToString(); // Shouldn't crash. } @@ -347,7 +347,7 @@ ENTRY root { })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseHloString(module_str)); TF_ASSERT_OK_AND_ASSIGN(auto dataflow, HloDataflowAnalysis::Run(*module, /*ssa_form=*/true)); DependencyHloOrdering ordering(module.get()); diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc similarity index 85% rename from tensorflow/compiler/xla/tools/parser/hlo_parser.cc rename to tensorflow/compiler/xla/service/hlo_parser.cc index fdbfc0210ea63a..3eadedfe1f8b0a 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc +++ b/tensorflow/compiler/xla/service/hlo_parser.cc @@ -13,10 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/gtl/map_util.h" @@ -24,17 +26,17 @@ limitations under the License. 
#include "tensorflow/core/lib/strings/stringprintf.h" namespace xla { -namespace tools { namespace { -using tensorflow::StringPiece; -using tensorflow::gtl::optional; -using tensorflow::str_util::Split; -using tensorflow::str_util::SplitAndParseAsInts; -using tensorflow::strings::Printf; -using tensorflow::strings::StrAppend; -using tensorflow::strings::StrCat; +using ::tensorflow::StringPiece; +using ::tensorflow::gtl::optional; +using ::tensorflow::str_util::Join; +using ::tensorflow::str_util::Split; +using ::tensorflow::str_util::SplitAndParseAsInts; +using ::tensorflow::strings::Printf; +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; const double kF16max = 65504; @@ -53,7 +55,12 @@ class HloParser { std::unique_ptr ConsumeHloModule() { return std::move(module_); } // Returns the error information. - string GetError() const { return tensorflow::str_util::Join(error_, "\n"); } + string GetError() const { return Join(error_, "\n"); } + + // Stand alone parsing utils for various aggregate data types. + StatusOr ParseShardingOnly(); + StatusOr ParseWindowOnly(); + StatusOr ParseConvolutionDimensionNumbersOnly(); private: // ParseXXX returns false if an error occurred. @@ -77,11 +84,15 @@ class HloParser { // Sets the sub-value of literal at the given index to the given value. The // literal's shape must have the default layout. - bool SetValueInLiteral(int64 value, int64 linear_index, Literal* literal); - bool SetValueInLiteral(double value, int64 linear_index, Literal* literal); - bool SetValueInLiteral(bool value, int64 linear_index, Literal* literal); + bool SetValueInLiteral(tensorflow::int64 value, + tensorflow::int64 linear_index, Literal* literal); + bool SetValueInLiteral(double value, tensorflow::int64 linear_index, + Literal* literal); + bool SetValueInLiteral(bool value, tensorflow::int64 linear_index, + Literal* literal); template - bool SetValueInLiteralHelper(ParsedElemT value, int64 linear_index, + bool SetValueInLiteralHelper(ParsedElemT value, + tensorflow::int64 linear_index, Literal* literal); bool ParseOperands(std::vector* operands); @@ -93,9 +104,15 @@ class HloParser { // Describes the start, limit, and stride on every dimension of the operand // being sliced. struct SliceRanges { - std::vector starts; - std::vector limits; - std::vector strides; + std::vector starts; + std::vector limits; + std::vector strides; + }; + + // The data parsed for the kDomain instruction. + struct DomainData { + std::unique_ptr entry_metadata; + std::unique_ptr exit_metadata; }; // Types of attributes. @@ -116,6 +133,7 @@ class HloParser { kMetadata, kFusionKind, kDistribution, + kDomain, }; struct AttrConfig { @@ -163,21 +181,27 @@ class HloParser { bool ParseComputationName(HloComputation** value); // Parses a list of names and finds the corresponding hlo instructions. bool ParseInstructionNames(std::vector* instructions); - bool ParseWindow(Window* window); + // Pass expect_outer_curlies == true when parsing a Window in the context of a + // larger computation. Pass false when parsing a stand-alone Window string. + bool ParseWindow(Window* window, bool expect_outer_curlies); bool ParseConvolutionDimensionNumbers(ConvolutionDimensionNumbers* dnums); bool ParsePaddingConfig(PaddingConfig* padding); bool ParseMetadata(OpMetadata* metadata); bool ParseSharding(OpSharding* sharding); bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed); + // Parses the metadata behind a kDOmain instruction. 
+ bool ParseDomain(DomainData* domain); + // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3. - bool ParseDxD(const string& name, std::vector* result); + bool ParseDxD(const string& name, std::vector* result); // Parses window's pad sub-attriute, e.g., pad=0_0x3x3. - bool ParseWindowPad(std::vector>* pad); + bool ParseWindowPad(std::vector>* pad); bool ParseSliceRanges(SliceRanges* result); bool ParseInt64List(const TokKind start, const TokKind end, - const TokKind delim, std::vector* result); + const TokKind delim, + std::vector* result); bool ParseParamListToShape(Shape* shape, LocTy* shape_loc); bool ParseParamList(); @@ -189,7 +213,7 @@ class HloParser { bool ParseFftType(FftType* result); bool ParseFusionKind(HloInstruction::FusionKind* result); bool ParseRandomDistribution(RandomDistribution* result); - bool ParseInt64(int64* result); + bool ParseInt64(tensorflow::int64* result); bool ParseDouble(double* result); bool ParseBool(bool* result); bool ParseToken(TokKind kind, const string& msg); @@ -242,10 +266,10 @@ bool HloParser::Error(LocTy loc, StringPiece msg) { std::vector error_lines; error_lines.push_back( StrCat("was parsing ", line, ":", col, ": error: ", msg)); - error_lines.push_back(lexer_.GetLine(loc).ToString()); + error_lines.push_back(std::string(lexer_.GetLine(loc))); error_lines.push_back(col == 0 ? "" : StrCat(string(col - 1, ' '), "^")); - error_.push_back(tensorflow::str_util::Join(error_lines, "\n")); + error_.push_back(Join(error_lines, "\n")); VLOG(1) << "Error: " << error_.back(); return false; } @@ -303,12 +327,18 @@ bool HloParser::ParseComputations() { // set the layouts to what the hlo text says. for (int p = 0; p < computation->num_parameters(); p++) { const Shape& param_shape = computation->parameter_instruction(p)->shape(); - TF_CHECK_OK(module_->mutable_entry_computation_layout() + TF_CHECK_OK(module_->mutable_host_entry_computation_layout() + ->mutable_parameter_layout(p) + ->CopyLayoutFromShape(param_shape)); + TF_CHECK_OK(module_->mutable_device_entry_computation_layout() ->mutable_parameter_layout(p) ->CopyLayoutFromShape(param_shape)); } const Shape& result_shape = computation->root_instruction()->shape(); - TF_CHECK_OK(module_->mutable_entry_computation_layout() + TF_CHECK_OK(module_->mutable_host_entry_computation_layout() + ->mutable_result_layout() + ->CopyLayoutFromShape(result_shape)); + TF_CHECK_OK(module_->mutable_device_entry_computation_layout() ->mutable_result_layout() ->CopyLayoutFromShape(result_shape)); } @@ -377,6 +407,7 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) { } *entry_computation = computation; } + instruction_pool_.clear(); return AddComputation(name, computation, name_loc); } @@ -433,10 +464,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, optional metadata; attrs["metadata"] = {/*required=*/false, AttrTy::kMetadata, &metadata}; + optional backend_config; + attrs["backend_config"] = {/*required=*/false, AttrTy::kString, + &backend_config}; + HloInstruction* instruction; switch (opcode) { case HloOpcode::kParameter: { - int64 parameter_number; + tensorflow::int64 parameter_number; if (!ParseToken(TokKind::kLparen, "expects '(' before parameter number") || !ParseInt64(¶meter_number) || @@ -470,10 +505,12 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, case HloOpcode::kCopy: case HloOpcode::kCos: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kImag: case HloOpcode::kIsFinite: case HloOpcode::kFloor: case HloOpcode::kLog: 
+ case HloOpcode::kLog1p: case HloOpcode::kNot: case HloOpcode::kNegate: case HloOpcode::kReal: @@ -550,11 +587,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kCrossReplicaSum: { + optional to_apply; + attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, + &to_apply}; if (!ParseOperands(&operands) || !ParseAttributes(attrs)) { return false; } instruction = builder->AddInstruction( - HloInstruction::CreateCrossReplicaSum(shape, operands)); + HloInstruction::CreateCrossReplicaSum(shape, operands, *to_apply)); break; } case HloOpcode::kReshape: { @@ -589,7 +629,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kRecv: { - optional channel_id; + optional channel_id; attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; if (!ParseOperands(&operands, /*expected_size=*/0) || !ParseAttributes(attrs)) { @@ -600,7 +640,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kRecvDone: { - optional channel_id; + optional channel_id; attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; if (!ParseOperands(&operands, /*expected_size=*/1) || !ParseAttributes(attrs)) { @@ -614,7 +654,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kSend: { - optional channel_id; + optional channel_id; attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; if (!ParseOperands(&operands, /*expected_size=*/1) || !ParseAttributes(attrs)) { @@ -625,7 +665,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kSendDone: { - optional channel_id; + optional channel_id; attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id}; if (!ParseOperands(&operands, /*expected_size=*/1) || !ParseAttributes(attrs)) { @@ -639,7 +679,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kGetTupleElement: { - optional index; + optional index; attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index}; if (!ParseOperands(&operands, /*expected_size=*/1) || !ParseAttributes(attrs)) { @@ -697,7 +737,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, } case HloOpcode::kFft: { optional fft_type; - optional> fft_length; + optional> fft_length; attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type}; attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List, &fft_length}; @@ -710,7 +750,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kBroadcast: { - optional> broadcast_dimensions; + optional> broadcast_dimensions; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, &broadcast_dimensions}; if (!ParseOperands(&operands, /*expected_size=*/1) || @@ -722,7 +762,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kConcatenate: { - optional> dimensions; + optional> dimensions; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, &dimensions}; if (!ParseOperands(&operands) || !ParseAttributes(attrs) || @@ -748,7 +788,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, optional reduce_computation; attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation, &reduce_computation}; - optional> dimensions_to_reduce; + optional> dimensions_to_reduce; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, 
&dimensions_to_reduce}; if (!ParseOperands(&operands, /*expected_size=*/2) || @@ -761,7 +801,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kReverse: { - optional> dimensions; + optional> dimensions; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, &dimensions}; if (!ParseOperands(&operands, /*expected_size=*/1) || @@ -805,7 +845,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kDynamicSlice: { - optional> dynamic_slice_sizes; + optional> dynamic_slice_sizes; attrs["dynamic_slice_sizes"] = { /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes}; if (!ParseOperands(&operands, /*expected_size=*/2) || @@ -829,7 +869,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kTranspose: { - optional> dimensions; + optional> dimensions; attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List, &dimensions}; if (!ParseOperands(&operands, /*expected_size=*/1) || @@ -843,7 +883,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, case HloOpcode::kBatchNormTraining: { optional epsilon; attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; - optional feature_index; + optional feature_index; attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, &feature_index}; if (!ParseOperands(&operands, /*expected_size=*/3) || @@ -859,7 +899,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, case HloOpcode::kBatchNormInference: { optional epsilon; attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; - optional feature_index; + optional feature_index; attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, &feature_index}; if (!ParseOperands(&operands, /*expected_size=*/5) || @@ -876,7 +916,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, case HloOpcode::kBatchNormGrad: { optional epsilon; attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon}; - optional feature_index; + optional feature_index; attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64, &feature_index}; if (!ParseOperands(&operands, /*expected_size=*/5) || @@ -947,8 +987,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kReducePrecision: { - optional exponent_bits; - optional mantissa_bits; + optional exponent_bits; + optional mantissa_bits; attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64, &exponent_bits}; attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64, @@ -993,7 +1033,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, } case HloOpcode::kHostCompute: { optional channel_name; - optional cost_estimate_ns; + optional cost_estimate_ns; attrs["channel_name"] = {/*required=*/true, AttrTy::kString, &channel_name}; attrs["cost_estimate_ns"] = {/*required=*/true, AttrTy::kInt64, @@ -1006,16 +1046,16 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kDot: { - optional> lhs_contracting_dims; + optional> lhs_contracting_dims; attrs["lhs_contracting_dims"] = { /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims}; - optional> rhs_contracting_dims; + optional> rhs_contracting_dims; attrs["rhs_contracting_dims"] = { /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims}; - optional> lhs_batch_dims; + optional> lhs_batch_dims; attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List, 
&lhs_batch_dims}; - optional> rhs_batch_dims; + optional> rhs_batch_dims; attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List, &rhs_batch_dims}; @@ -1047,20 +1087,20 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, break; } case HloOpcode::kGather: { - optional> output_window_dims; + optional> output_window_dims; attrs["output_window_dims"] = { /*required=*/true, AttrTy::kBracedInt64List, &output_window_dims}; - optional> elided_window_dims; + optional> elided_window_dims; attrs["elided_window_dims"] = { /*required=*/true, AttrTy::kBracedInt64List, &elided_window_dims}; - optional> gather_dims_to_operand_dims; + optional> gather_dims_to_operand_dims; attrs["gather_dims_to_operand_dims"] = {/*required=*/true, AttrTy::kBracedInt64List, &gather_dims_to_operand_dims}; - optional index_vector_dim; + optional index_vector_dim; attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64, &index_vector_dim}; - optional> window_bounds; + optional> window_bounds; attrs["window_bounds"] = {/*required=*/true, AttrTy::kBracedInt64List, &window_bounds}; @@ -1080,6 +1120,18 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, dim_numbers, *window_bounds)); break; } + case HloOpcode::kDomain: { + DomainData domain; + attrs["domain"] = {/*required=*/true, AttrTy::kDomain, &domain}; + if (!ParseOperands(&operands, /*expected_size=*/1) || + !ParseAttributes(attrs)) { + return false; + } + instruction = builder->AddInstruction(HloInstruction::CreateDomain( + shape, operands[0], std::move(domain.entry_metadata), + std::move(domain.exit_metadata))); + break; + } case HloOpcode::kTrace: return TokenError(StrCat("parsing not yet implemented for op: ", HloOpcodeString(opcode))); @@ -1087,8 +1139,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, instruction->set_name(name); - // Add common attrs (sharding, control predecessors) to the instruction, if - // they were seen. + // Add shared attributes like metadata to the instruction, if they were seen. 
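The `domain={...}` attribute handled by the new kDomain case accepts the textual form emitted by `HloInstruction::ToString()`; the `DomainParsing` test case added to hlo_parser_test.cc later in this change uses exactly this input. A minimal sketch of driving the parser on it, assuming only the public `ParseHloString` entry point introduced by this change (the helper name is illustrative):

```c++
#include <memory>

#include "tensorflow/compiler/xla/service/hlo_parser.h"

// Sketch only: the HLO text is copied from the DomainParsing test case added
// to hlo_parser_test.cc in this same change.
xla::StatusOr<std::unique_ptr<xla::HloModule>> ParseDomainExample() {
  const char* kHloText = R"(HloModule DomainParsing_module

ENTRY %DomainParsing (v1: f32[]) -> f32[] {
  %v1 = f32[] parameter(0)
  ROOT %dom = f32[] domain(f32[] %v1), domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}}
}
)";
  // The resulting kDomain instruction carries ShardingMetadata for both the
  // entry and exit sides of the domain, as assembled by ParseDomain.
  return xla::ParseHloString(kHloText);
}
```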
if (sharding) { instruction->set_sharding( HloSharding::FromProto(sharding.value()).ValueOrDie()); @@ -1105,6 +1156,9 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder, if (metadata) { instruction->set_metadata(*metadata); } + if (backend_config) { + instruction->set_raw_backend_config_string(std::move(*backend_config)); + } return AddInstruction(name, instruction, name_loc); } // NOLINT(readability/fn_size) @@ -1154,8 +1208,8 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, LocTy loc = lexer_.GetLoc(); bool maximal = false; bool replicated = false; - std::vector devices; - std::vector tile_assignment_dimensions; + std::vector devices; + std::vector tile_assignment_dimensions; Shape tile_shape; while (lexer_.GetKind() != TokKind::kRbrace) { switch (lexer_.GetKind()) { @@ -1182,7 +1236,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, } do { - int64 dim; + tensorflow::int64 dim; if (!ParseInt64(&dim)) { return false; } @@ -1194,7 +1248,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, return false; } do { - int64 device; + tensorflow::int64 device; if (!ParseInt64(&device)) { return false; } @@ -1253,10 +1307,10 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, } sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER); *sharding->mutable_tile_shape() = tile_shape; - for (int64 dim : tile_assignment_dimensions) { + for (tensorflow::int64 dim : tile_assignment_dimensions) { sharding->add_tile_assignment_dimensions(dim); } - for (int64 device : devices) { + for (tensorflow::int64 device : devices) { sharding->add_tile_assignment_devices(device); } } @@ -1265,6 +1319,34 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding, return true; } +// domain ::= '{' 'kind=' domain_kind ',' 'entry=' entry_sharding ',' +// 'exit=' exit_sharding '}' +bool HloParser::ParseDomain(DomainData* domain) { + std::unordered_map attrs; + optional kind; + optional entry_sharding; + optional exit_sharding; + attrs["kind"] = {/*required=*/true, AttrTy::kString, &kind}; + attrs["entry"] = {/*required=*/true, AttrTy::kSharding, &entry_sharding}; + attrs["exit"] = {/*required=*/true, AttrTy::kSharding, &exit_sharding}; + if (!ParseSubAttributes(attrs)) { + return false; + } + if (*kind == ShardingMetadata::KindName()) { + auto entry_sharding_ptr = MakeUnique( + HloSharding::FromProto(*entry_sharding).ValueOrDie()); + auto exit_sharding_ptr = MakeUnique( + HloSharding::FromProto(*exit_sharding).ValueOrDie()); + domain->entry_metadata = + MakeUnique(std::move(entry_sharding_ptr)); + domain->exit_metadata = + MakeUnique(std::move(exit_sharding_ptr)); + } else { + return TokenError(StrCat("unsupported domain kind: ", *kind)); + } + return true; +} + // '{' name+ '}' bool HloParser::ParseInstructionNames( std::vector* instructions) { @@ -1291,40 +1373,50 @@ bool HloParser::ParseInstructionNames( "expects '}' at the end of instruction name list"); } -bool HloParser::SetValueInLiteral(int64 value, int64 linear_index, +bool HloParser::SetValueInLiteral(tensorflow::int64 value, + tensorflow::int64 linear_index, Literal* literal) { const Shape& shape = literal->shape(); switch (shape.element_type()) { case S8: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); case S16: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); case S32: - return SetValueInLiteralHelper(value, linear_index, literal); + 
return SetValueInLiteralHelper(value, linear_index, + literal); case S64: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); case U8: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); case U16: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); case U32: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); case U64: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); default: LOG(FATAL) << "unknown integral primitive type " << PrimitiveType_Name(shape.element_type()); } } -bool HloParser::SetValueInLiteral(double value, int64 linear_index, +bool HloParser::SetValueInLiteral(double value, tensorflow::int64 linear_index, Literal* literal) { const Shape& shape = literal->shape(); switch (shape.element_type()) { case F16: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, literal); case BF16: - return SetValueInLiteralHelper(value, linear_index, literal); + return SetValueInLiteralHelper(value, linear_index, + literal); case F32: return SetValueInLiteralHelper(value, linear_index, literal); case F64: @@ -1335,7 +1427,7 @@ bool HloParser::SetValueInLiteral(double value, int64 linear_index, } } -bool HloParser::SetValueInLiteral(bool value, int64 linear_index, +bool HloParser::SetValueInLiteral(bool value, tensorflow::int64 linear_index, Literal* literal) { const Shape& shape = literal->shape(); switch (shape.element_type()) { @@ -1348,7 +1440,8 @@ bool HloParser::SetValueInLiteral(bool value, int64 linear_index, } template -bool HloParser::SetValueInLiteralHelper(ParsedElemT value, int64 linear_index, +bool HloParser::SetValueInLiteralHelper(ParsedElemT value, + tensorflow::int64 linear_index, Literal* literal) { // Check that linear_index is in range. if (linear_index >= ShapeUtil::ElementsIn(literal->shape())) { @@ -1460,7 +1553,7 @@ bool HloParser::ParseNonTupleLiteral(std::unique_ptr* literal, bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, const Shape& shape) { - const int64 rank = ShapeUtil::Rank(shape); + const tensorflow::int64 rank = ShapeUtil::Rank(shape); if (rank > 1 && !EatShapeAndCheckCompatible(shape)) { return false; } @@ -1468,8 +1561,8 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, // Create a literal with the given shape in default layout. *literal = Literal::CreateFromDimensions(shape.element_type(), AsInt64Slice(shape.dimensions())); - int64 nest_level = 0; - int64 linear_index = 0; + tensorflow::int64 nest_level = 0; + tensorflow::int64 linear_index = 0; // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}}, // when we are parsing the 2nd '{' (right before '1'), we are seeing a @@ -1477,16 +1570,15 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, // the first '}' (right after '3'), it means the sub-array ends, and the // sub-array is supposed to contain exactly 3 elements, so check if // elems_seen_per_dim[1] is 3. 
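The mapping from the nested text form to the linear indices passed to SetValueInLiteral can be made concrete; this is an illustrative aside, assuming the default layout produced by `Literal::CreateFromDimensions`:

```c++
// Illustrative only: the running example f32[2,3] {{1, 2, 3}, {4, 5, 6}} is
// written into the literal through SetValueInLiteral as
//
//   linear_index : 0  1  2  3  4  5
//   value        : 1  2  3  4  5  6
//
// and at each closing '}' the parser checks that the innermost dimension saw
// exactly shape.dimensions(1) == 3 elements, i.e. elems_seen_per_dim[1] == 3.
```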
- std::vector elems_seen_per_dim(rank); + std::vector elems_seen_per_dim(rank); auto get_index_str = [&elems_seen_per_dim](int dim) -> string { - std::vector elems_seen_until_dim(elems_seen_per_dim.begin(), - elems_seen_per_dim.begin() + dim); + std::vector elems_seen_until_dim( + elems_seen_per_dim.begin(), elems_seen_per_dim.begin() + dim); return StrCat("[", - tensorflow::str_util::Join( - elems_seen_until_dim, ",", - [](string* out, const int64& num_elems) { - tensorflow::strings::StrAppend(out, num_elems - 1); - }), + Join(elems_seen_until_dim, ",", + [](string* out, const tensorflow::int64& num_elems) { + StrAppend(out, num_elems - 1); + }), "]"); }; do { @@ -1561,7 +1653,7 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr* literal, lexer_.Lex(); } else if (primitive_util::IsIntegralType(shape.element_type())) { LocTy loc = lexer_.GetLoc(); - int64 value; + tensorflow::int64 value; if (!ParseInt64(&value)) { return Error(loc, StrCat("expects integer for primitive type: ", PrimitiveType_Name(shape.element_type()))); @@ -1601,29 +1693,29 @@ bool HloParser::ParseSparseLiteral(std::unique_ptr* literal, switch (shape.element_type()) { case PRED: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case S8: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case S16: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case S32: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case S64: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case U8: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case U16: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case U32: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case U64: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case F16: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case F32: return ParseSparseLiteralHelper(literal, shape); case BF16: - return ParseSparseLiteralHelper(literal, shape); + return ParseSparseLiteralHelper(literal, shape); case F64: return ParseSparseLiteralHelper(literal, shape); default: @@ -1636,9 +1728,9 @@ bool HloParser::ParseSparseLiteral(std::unique_ptr* literal, template bool HloParser::ParseSparseLiteralHelper(std::unique_ptr* literal, const Shape& shape) { - std::vector index; + std::vector index; - int64 rank = ShapeUtil::Rank(shape); + tensorflow::int64 rank = ShapeUtil::Rank(shape); *literal = MakeUnique(shape); @@ -1656,7 +1748,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr* literal, LocTy index_loc = lexer_.GetLoc(); index.clear(); if (lexer_.GetKind() == TokKind::kInt) { - int64 single_index = lexer_.GetInt64Val(); + tensorflow::int64 single_index = lexer_.GetInt64Val(); lexer_.Lex(); if (rank != 1) { return Error( @@ -1674,7 +1766,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr* literal, return Error( index_loc, StrCat("invalid multi-dimension index for shape with rank ", rank, - ": [", tensorflow::str_util::Join(index, ", "), "]")); + ": [", Join(index, ", "), "]")); } } if (!ParseToken(TokKind::kColon, @@ -1689,7 +1781,7 @@ bool 
HloParser::ParseSparseLiteralHelper(std::unique_ptr* literal, value = static_cast(lexer_.GetKind() == TokKind::kw_true); lexer_.Lex(); } else if (primitive_util::IsIntegralType(shape.element_type())) { - int64 value_s64; + tensorflow::int64 value_s64; if (!ParseInt64(&value_s64)) { return Error(value_loc, StrCat("expects integer for primitive type: ", @@ -1842,7 +1934,19 @@ bool HloParser::ParseAttributeHelper( } auto attr_it = attrs.find(name); if (attr_it == attrs.end()) { - return Error(loc, Printf("unexpected attribute %s", name.c_str())); + string allowed_attrs; + if (attrs.empty()) { + allowed_attrs = "No attributes are allowed here."; + } else { + allowed_attrs = StrCat( + "Allowed attributes: ", + Join(attrs, ", ", + [&](string* out, const std::pair& kv) { + StrAppend(out, kv.first); + })); + } + return Error(loc, Printf("unexpected attribute \"%s\". %s", name.c_str(), + allowed_attrs.c_str())); } AttrTy attr_type = attr_it->second.attr_type; void* attr_out_ptr = attr_it->second.result; @@ -1850,23 +1954,24 @@ bool HloParser::ParseAttributeHelper( LocTy attr_loc = lexer_.GetLoc(); switch (attr_type) { case AttrTy::kInt64: { - int64 result; + tensorflow::int64 result; if (!ParseInt64(&result)) { return false; } - static_cast*>(attr_out_ptr)->emplace(result); + static_cast*>(attr_out_ptr) + ->emplace(result); return true; } case AttrTy::kInt32: { - int64 result; + tensorflow::int64 result; if (!ParseInt64(&result)) { return false; } - if (result != static_cast(result)) { + if (result != static_cast(result)) { return Error(attr_loc, "value out of range for int32"); } - static_cast*>(attr_out_ptr) - ->emplace(static_cast(result)); + static_cast*>(attr_out_ptr) + ->emplace(static_cast(result)); return true; } case AttrTy::kFloat: { @@ -1900,7 +2005,7 @@ bool HloParser::ParseAttributeHelper( } case AttrTy::kWindow: { Window result; - if (!ParseWindow(&result)) { + if (!ParseWindow(&result, /*expect_outer_curlies=*/true)) { return false; } static_cast*>(attr_out_ptr)->emplace(result); @@ -1942,12 +2047,12 @@ bool HloParser::ParseAttributeHelper( return true; } case AttrTy::kBracedInt64List: { - std::vector result; + std::vector result; if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma, &result)) { return false; } - static_cast>*>(attr_out_ptr) + static_cast>*>(attr_out_ptr) ->emplace(result); return true; } @@ -1992,6 +2097,9 @@ bool HloParser::ParseAttributeHelper( ->emplace(result); return true; } + case AttrTy::kDomain: { + return ParseDomain(static_cast(attr_out_ptr)); + } } }(); if (!success) { @@ -2018,9 +2126,10 @@ bool HloParser::ParseComputationName(HloComputation** value) { // ::= '{' size stride? pad? lhs_dilate? rhs_dilate? '}' // The subattributes can appear in any order. 'size=' is required, others are // optional. -bool HloParser::ParseWindow(Window* window) { +bool HloParser::ParseWindow(Window* window, bool expect_outer_curlies) { LocTy loc = lexer_.GetLoc(); - if (!ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) { + if (expect_outer_curlies && + !ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) { return false; } @@ -2030,7 +2139,9 @@ bool HloParser::ParseWindow(Window* window) { std::vector lhs_dilate; std::vector rhs_dilate; std::vector rhs_reversal; - while (lexer_.GetKind() != TokKind::kRbrace) { + const auto end_token = + expect_outer_curlies ? 
TokKind::kRbrace : TokKind::kEof; + while (lexer_.GetKind() != end_token) { LocTy attr_loc = lexer_.GetLoc(); string field_name; if (!ParseAttributeName(&field_name)) { @@ -2094,7 +2205,8 @@ bool HloParser::ParseWindow(Window* window) { window->mutable_dimensions(i)->set_window_reversal( rhs_reversal.empty() ? false : (rhs_reversal[i] == 1)); } - return ParseToken(TokKind::kRbrace, "expected '}' to end window attribute"); + return !expect_outer_curlies || + ParseToken(TokKind::kRbrace, "expected '}' to end window attribute"); } // This is the inverse of HloInstruction::ConvolutionDimensionNumbersToString. @@ -2118,7 +2230,7 @@ bool HloParser::ParseConvolutionDimensionNumbers( << str; } - const int64 rank = lhs_rhs_out[0].length(); + const tensorflow::int64 rank = lhs_rhs_out[0].length(); if (rank != lhs_rhs_out[1].length() || rank != lhs_rhs_out[2].length()) { return TokenError( "convolution lhs, rhs, and output must have the same rank"); @@ -2232,7 +2344,7 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) { if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) { return false; } - std::vector> ranges; + std::vector> ranges; if (lexer_.GetKind() == TokKind::kRbrace) { // empty return ParseToken(TokKind::kRbrace, "expects '}' to end ranges"); @@ -2266,7 +2378,7 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) { // ::= int64_val (delim int64_val)* bool HloParser::ParseInt64List(const TokKind start, const TokKind end, const TokKind delim, - std::vector* result) { + std::vector* result) { if (!ParseToken(start, StrCat("expects an int64 list starting with ", TokKindToString(start)))) { return false; @@ -2275,7 +2387,7 @@ bool HloParser::ParseInt64List(const TokKind start, const TokKind end, // empty } else { do { - int64 i; + tensorflow::int64 i; if (!ParseInt64(&i)) { return false; } @@ -2392,7 +2504,8 @@ bool HloParser::ParseString(string* result) { return true; } -bool HloParser::ParseDxD(const string& name, std::vector* result) { +bool HloParser::ParseDxD(const string& name, + std::vector* result) { LocTy loc = lexer_.GetLoc(); if (!result->empty()) { return Error(loc, @@ -2400,7 +2513,7 @@ bool HloParser::ParseDxD(const string& name, std::vector* result) { } // 1D if (lexer_.GetKind() == TokKind::kInt) { - int64 number; + tensorflow::int64 number; if (!ParseInt64(&number)) { return Error(loc, Printf("expects sub-attribute '%s=i'", name.c_str())); } @@ -2420,7 +2533,8 @@ bool HloParser::ParseDxD(const string& name, std::vector* result) { return TokenError("expects token type kInt or kDxD"); } -bool HloParser::ParseWindowPad(std::vector>* pad) { +bool HloParser::ParseWindowPad( + std::vector>* pad) { LocTy loc = lexer_.GetLoc(); if (!pad->empty()) { return Error(loc, "sub-attribute 'pad=' already exists"); @@ -2431,7 +2545,7 @@ bool HloParser::ParseWindowPad(std::vector>* pad) { string str = lexer_.GetStrVal(); std::vector padding_str = Split(str, 'x'); for (int i = 0; i < padding_str.size(); i++) { - std::vector low_high; + std::vector low_high; if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) || low_high.size() != 2) { return Error(loc, @@ -2455,7 +2569,7 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) { string str = lexer_.GetStrVal(); std::vector padding_str = Split(str, 'x'); for (const auto& padding_dim_str : padding_str) { - std::vector padding_dim; + std::vector padding_dim; if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) || (padding_dim.size() != 2 && padding_dim.size() != 3)) { return Error(loc, @@ -2477,7 +2591,7 @@ bool 
HloParser::ParseMetadata(OpMetadata* metadata) { optional op_type; optional op_name; optional source_file; - optional source_line; + optional source_line; attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type}; attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name}; attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file}; @@ -2564,7 +2678,7 @@ bool HloParser::ParseRandomDistribution(RandomDistribution* result) { return true; } -bool HloParser::ParseInt64(int64* result) { +bool HloParser::ParseInt64(tensorflow::int64* result) { VLOG(1) << "ParseInt64"; if (lexer_.GetKind() != TokKind::kInt) { return TokenError("expects integer"); @@ -2647,10 +2761,48 @@ bool HloParser::AddComputation(const string& name, HloComputation* computation, return true; } +StatusOr HloParser::ParseShardingOnly() { + lexer_.Lex(); + OpSharding op_sharding; + if (!ParseSharding(&op_sharding)) { + return InvalidArgument("Syntax error:\n%s", GetError().c_str()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument("Syntax error:\nExtra content after sharding"); + } + return HloSharding::FromProto(op_sharding); +} + +StatusOr HloParser::ParseWindowOnly() { + lexer_.Lex(); + Window window; + if (!ParseWindow(&window, /*expect_outer_curlies=*/false)) { + return InvalidArgument("Syntax error:\n%s", GetError().c_str()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument("Syntax error:\nExtra content after window"); + } + return window; +} + +StatusOr +HloParser::ParseConvolutionDimensionNumbersOnly() { + lexer_.Lex(); + ConvolutionDimensionNumbers dnums; + if (!ParseConvolutionDimensionNumbers(&dnums)) { + return InvalidArgument("Syntax error:\n%s", GetError().c_str()); + } + if (lexer_.GetKind() != TokKind::kEof) { + return InvalidArgument( + "Syntax error:\nExtra content after convolution dnums"); + } + return dnums; +} + } // namespace -StatusOr> Parse(StringPiece str, - const HloModuleConfig& config) { +StatusOr> ParseHloString( + tensorflow::StringPiece str, const HloModuleConfig& config) { HloParser parser(str, config); if (!parser.Run()) { return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str()); @@ -2658,10 +2810,29 @@ StatusOr> Parse(StringPiece str, return parser.ConsumeHloModule(); } -StatusOr> Parse(StringPiece str) { +StatusOr> ParseHloString( + tensorflow::StringPiece str) { + HloModuleConfig config; + return ParseHloString(str, config); +} + +StatusOr ParseSharding(tensorflow::StringPiece str) { HloModuleConfig config; - return Parse(str, config); + HloParser parser(str, config); + return parser.ParseShardingOnly(); +} + +StatusOr ParseWindow(tensorflow::StringPiece str) { + HloModuleConfig config; + HloParser parser(str, config); + return parser.ParseWindowOnly(); +} + +StatusOr ParseConvolutionDimensionNumbers( + tensorflow::StringPiece str) { + HloModuleConfig config; + HloParser parser(str, config); + return parser.ParseConvolutionDimensionNumbersOnly(); } -} // namespace tools } // namespace xla diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h similarity index 52% rename from tensorflow/compiler/xla/tools/parser/hlo_parser.h rename to tensorflow/compiler/xla/service/hlo_parser.h index 2f97a2b9b19d0c..3f3a51215e34bb 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser.h +++ b/tensorflow/compiler/xla/service/hlo_parser.h @@ -13,30 +13,47 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_ -#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_ #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_lexer.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_lexer.h" #include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { -namespace tools { + +// For details about the syntax accepted by this parser, see +// g3doc/hlo_parser.md. // The api of the hlo parser. Given a string in the HloModule::ToString() // format, parses the string and creates a HloModule with the given config. -StatusOr> Parse(tensorflow::StringPiece str, - const HloModuleConfig& config); +StatusOr> ParseHloString( + tensorflow::StringPiece str, const HloModuleConfig& config); // The api of the hlo parser. Given a string in the HloModule::ToString() // format, parses the string and creates a HloModule with default config. -StatusOr> Parse(tensorflow::StringPiece str); +StatusOr> ParseHloString( + tensorflow::StringPiece str); + +// Parses the result of HloSharding::ToString(), e.g. "{replicated}". +StatusOr ParseSharding(tensorflow::StringPiece str); + +// Parses the result of window_util::ToString(const Window&). +StatusOr ParseWindow(tensorflow::StringPiece str); + +// Parses the result of ConvolutionDimensionNumbersToString(), e.g. +// "b0f_0io->b0f". +StatusOr ParseConvolutionDimensionNumbers( + tensorflow::StringPiece str); + +// ParseHloString sharding from str. str is supposed to contain the body of the +// sharding, i.e. just the rhs of the "sharding={...}" attribute string. +StatusOr ParseSharding(tensorflow::StringPiece str); -} // namespace tools } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_ diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc similarity index 86% rename from tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc rename to tensorflow/compiler/xla/service/hlo_parser_test.cc index adc8b1d620eb65..08068dc5042d58 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc +++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc @@ -13,19 +13,20 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include +#include "tensorflow/compiler/xla/window_util.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/test.h" namespace xla { -namespace tools { + namespace { -using tensorflow::StringPiece; +using ::tensorflow::StringPiece; struct TestData { string test_name; @@ -65,7 +66,7 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] { R"(HloModule constant_pred_module ENTRY %constant_pred () -> pred[] { - ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68} + ROOT %constant = pred[] constant(true), metadata={op_type="const" op_name="\"it\'s not a problem\n" source_file="path/to/test.cc" source_line=68}, backend_config="foo\" bar" } )" @@ -81,13 +82,14 @@ ENTRY %constant_s32 () -> s32[] { )" }, -// f32 constant, but the value is not a decimal +// f32 constant, but the value is not a decimal and there is a backend +// configuration { "ConstantF32", R"(HloModule ConstantF32_module ENTRY %ConstantF32.v4 () -> f32[] { - ROOT %constant = f32[] constant(42) + ROOT %constant = f32[] constant(42), backend_config="this is a configuration" } )" @@ -232,6 +234,17 @@ ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f3 ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3), sharding={{replicated}, {maximal device=0}, {replicated}} } +)" +}, +{ +"DomainParsing", +R"(HloModule DomainParsing_module + +ENTRY %DomainParsing (v1: f32[]) -> f32[] { + %v1 = f32[] parameter(0) + ROOT %dom = f32[] domain(f32[] %v1), domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}} +} + )" }, // int32 result = 0; @@ -885,6 +898,24 @@ ENTRY Gather { ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26} } +)" +}, +// cross-replica-sum +{ +"CrossReplicaSum", +R"(HloModule CRS + +add { + lhs = f32[] parameter(0) + rhs = f32[] parameter(1) + ROOT add = f32[] add(lhs, rhs) +} + +ENTRY CRS { + input = f32[8]{0} parameter(0) + ROOT crs = f32[8]{0} cross-replica-sum(input), to_apply=add +} + )" }, }); @@ -899,12 +930,12 @@ class HloParserTest : public ::testing::Test, << "'" << s << "' does not contain '" << expected << "'"; } - // Expects "ToString(Parse(string)) == string", that is, parses the string, - // asserts that it succeeded, stringifies the parsed module, and checks that - // the it equals the original string. + // Expects "ToString(ParseHloString(string)) == string", that is, parses the + // string, asserts that it succeeded, stringifies the parsed module, and + // checks that the it equals the original string. 
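The same parse-then-print round-trip contract extends to the new stand-alone helpers. A condensed sketch mirroring the ParseSharding, ParseWindow and ParseConvolutionDimensionNumbers tests added at the end of this file (the function name and the use of `CHECK_EQ` are illustrative; the real tests pass `HloPrintOptions` to control constant printing):

```c++
#include <string>

#include "tensorflow/compiler/xla/service/hlo_parser.h"
#include "tensorflow/core/platform/logging.h"

// Sketch only: condensed version of the round-trip checks in this test file.
void RoundTripSketch(const std::string& module_text) {
  // Parse-then-print should reproduce the original module text.
  auto module = xla::ParseHloString(module_text).ConsumeValueOrDie();
  CHECK_EQ(module_text, module->ToString());

  // The stand-alone parsers introduced by this change round-trip the textual
  // forms produced by the corresponding ToString helpers.
  xla::HloSharding sharding =
      xla::ParseSharding("{maximal device=42}").ValueOrDie();
  CHECK_EQ("{maximal device=42}", sharding.ToString());

  xla::ConvolutionDimensionNumbers dnums =
      xla::ParseConvolutionDimensionNumbers("b0f_0io->b0f").ValueOrDie();
  CHECK_EQ("b0f_0io->b0f", xla::ConvolutionDimensionNumbersToString(dnums));
}
```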
void ExpectEqual() { const string& original = GetParam().module_string; - auto result = Parse(original); + auto result = ParseHloString(original); TF_ASSERT_OK(result.status()); EXPECT_EQ(original, result.ValueOrDie()->ToString( HloPrintOptions().set_print_large_constants(true))); @@ -915,7 +946,7 @@ class HloParserShortTest : public HloParserTest { protected: void ExpectEqualShort() { const string& original = GetParam().module_string; - auto result = Parse(original); + auto result = ParseHloString(original); TF_ASSERT_OK(result.status()); EXPECT_EQ(original, result.ValueOrDie()->ToString(HloPrintOptions::ShortParsable())); @@ -936,14 +967,14 @@ INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest, TEST_F(HloParserTest, Empty) { const string original = ""; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); } TEST_F(HloParserTest, Garbage) { const string original = "HloModule thi$ str1ng makes# N0 sen$e @all!*&^%$"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); } TEST_F(HloParserTest, WrongOpcode) { @@ -956,8 +987,8 @@ ENTRY %blabla (x: f32[], y: f32[]) -> f32[] { } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); } TEST_F(HloParserTest, WrongShape) { @@ -968,8 +999,8 @@ ENTRY %blabla (x: g32[]) -> g32[] { } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); } TEST_F(HloParserTest, WrongOperandsSize) { @@ -981,8 +1012,8 @@ ENTRY %blabla (x: f32[]) -> pred[] { } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); } TEST_F(HloParserTest, OperandNotFound) { @@ -992,8 +1023,8 @@ ENTRY %blabla (x: f32[]) -> pred[] { %eq = pred[]{} equal-to(f32[]{} %x, f32[]{} %y) } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); } TEST_F(HloParserTest, MoreConstants) { @@ -1007,12 +1038,25 @@ ENTRY %SelectScalarS32True.v4 () -> s32[] { } )"; - auto result = Parse(original); + auto result = ParseHloString(original); TF_EXPECT_OK(result.status()); // Constant instructions have no name. The string will be parsed successfully // but the constant names will not be exactly the same. 
} +TEST_F(HloParserTest, ConfigurationField) { + const string original = R"(HloModule AModule +ENTRY %configuration_test() -> s32[] { + %constant = s32[] constant(42), backend_config="foo bar" +})"; + auto result = ParseHloString(original); + TF_ASSERT_OK(result.status()); + EXPECT_EQ("foo bar", result.ValueOrDie() + ->entry_computation() + ->root_instruction() + ->raw_backend_config_string()); +} + TEST_F(HloParserTest, LiteralDimensionsMismatch_1) { const string original = R"(HloModule some_2_module @@ -1021,8 +1065,8 @@ ENTRY %some_2 () -> f32[2] { } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); ExpectHasSubstr(result.status().error_message(), "expects nested array in rank 1, but sees larger"); } @@ -1035,8 +1079,8 @@ ENTRY %some_2x3 () -> f32[2,3] { } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); ExpectHasSubstr(result.status().error_message(), "expects nested array in rank 2, but sees 1"); } @@ -1049,8 +1093,8 @@ ENTRY %some_2x3x2 () -> f32[2,3,2] { } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); ExpectHasSubstr(result.status().error_message(), "expects 3 elements in the [0]th element"); } @@ -1064,8 +1108,8 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] { } )"; - auto result = Parse(original); - EXPECT_NE(tensorflow::Status::OK(), result.status()); + auto result = ParseHloString(original); + EXPECT_NE(Status::OK(), result.status()); ExpectHasSubstr(result.status().error_message(), "is out of range for literal's primitive type F16"); } @@ -1078,7 +1122,7 @@ ENTRY %ConstantWithExp.v4 () -> f32[] { } )"; - auto result = Parse(original); + auto result = ParseHloString(original); TF_EXPECT_OK(result.status()); // The string will be parsed successfully but the output strings are not // exactly the same, because "3e2" is parsed into value 300 and will be @@ -1092,11 +1136,11 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2 %input = f32[1,2,1]{2,1,0} parameter(0) %copy = f32[1,2,1]{2,0,1} copy(f32[1,2,1]{2,1,0} %input) %filter = f32[1,1,1]{2,1,0} parameter(1) - ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, dim_labels=b0f_0io->b0f, window={pad=1_1 size=2} + ROOT %convolution = f32[1,2,1]{2,0,1} convolution(f32[1,2,1]{2,0,1} %copy, f32[1,1,1]{2,1,0} %filter), sharding={maximal device=1}, backend_config="foo", dim_labels=b0f_0io->b0f, window={pad=1_1 size=2} } )"; - TF_EXPECT_OK(Parse(original).status()); + TF_EXPECT_OK(ParseHloString(original).status()); } TEST_F(HloParserTest, InvalidDimLabels) { @@ -1112,17 +1156,18 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2 )"; + ExpectHasSubstr(ParseHloString(tensorflow::strings::StrCat( + prefix, ",dim_labels=00_01_10", suffix)) + .status() + .error_message(), + "expects dim labels pattern"); + ExpectHasSubstr( - Parse(tensorflow::strings::StrCat(prefix, ",dim_labels=00_01_10", suffix)) + ParseHloString(tensorflow::strings::StrCat( + prefix, ",dim_labels=010_1100->010", suffix)) .status() .error_message(), - "expects dim labels pattern"); - - ExpectHasSubstr(Parse(tensorflow::strings::StrCat( - prefix, 
",dim_labels=010_1100->010", suffix)) - .status() - .error_message(), - "must have the same rank"); + "must have the same rank"); } TEST_F(HloParserTest, UnexpectedAttribute) { @@ -1137,8 +1182,8 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { } )"; - ExpectHasSubstr(Parse(original).status().error_message(), - "unexpected attribute calls"); + ExpectHasSubstr(ParseHloString(original).status().error_message(), + "unexpected attribute \"calls\""); } TEST_F(HloParserTest, MissingAttribute) { @@ -1153,7 +1198,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { } )"; - ExpectHasSubstr(Parse(original).status().error_message(), + ExpectHasSubstr(ParseHloString(original).status().error_message(), "attribute channel_id is expected but not seen"); } @@ -1169,7 +1214,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] { } )"; - ExpectHasSubstr(Parse(original).status().error_message(), + ExpectHasSubstr(ParseHloString(original).status().error_message(), "'done' is not defined"); } @@ -1182,7 +1227,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] { } )"; - TF_EXPECT_OK(Parse(original).status()); + TF_EXPECT_OK(ParseHloString(original).status()); } TEST_F(HloParserTest, PaddingConfigIsNotWindowPad) { @@ -1196,7 +1241,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2 } )"; - ExpectHasSubstr(Parse(original).status().error_message(), + ExpectHasSubstr(ParseHloString(original).status().error_message(), "expects padding_low and padding_high separated by '_'"); } @@ -1208,7 +1253,7 @@ ENTRY %test_comma.v4 () -> f32[] { } )"; - TF_EXPECT_OK(Parse(original).status()); + TF_EXPECT_OK(ParseHloString(original).status()); } TEST_F(HloParserTest, ComputationShapeDoesNotMatchRootShape) { @@ -1218,7 +1263,7 @@ ENTRY %CustomCall () -> f32[1] { %constant = f32[1]{0} constant({12345}) ROOT %foo = f32[1,2,3]{0,2,1} custom-call(f32[1]{0} %constant), custom_call_target="foo\"bar" })"; - ExpectHasSubstr(Parse(original).status().error_message(), + ExpectHasSubstr(ParseHloString(original).status().error_message(), "Shape of computation CustomCall, f32[1], is not compatible " "with that of its root instruction foo, f32[1,2,3]"); } @@ -1237,9 +1282,9 @@ ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] { ROOT reduce = f32[8,16]{0,1} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3 })"; - auto module = Parse(original); + auto module = ParseHloString(original); TF_ASSERT_OK(module.status()); - auto program_layout = module.ValueOrDie()->entry_computation_layout(); + auto program_layout = module.ValueOrDie()->host_entry_computation_layout(); ASSERT_EQ(program_layout.parameter_count(), 1); auto param_layout = program_layout.parameter_layout(0).layout(); auto result_layout = program_layout.result_layout().layout(); @@ -1260,7 +1305,7 @@ c1 { c2 { const2 = f32[1]{0} constant({67890}) })"; - auto module = Parse(original); + auto module = ParseHloString(original); TF_ASSERT_OK(module.status()); EXPECT_EQ(module.ValueOrDie()->entry_computation()->name(), "c2"); } @@ -1271,7 +1316,7 @@ ENTRY consts { first = f32[1]{0} constant({12345}) last = f32[1]{0} constant({67890}) })"; - auto module = Parse(original); + auto module = ParseHloString(original); TF_ASSERT_OK(module.status()); EXPECT_EQ( module.ValueOrDie()->entry_computation()->root_instruction()->name(), @@ -1286,7 +1331,7 @@ ENTRY c1 { ENTRY c2 { const2 = f32[1]{0} constant({67890}) })"; - ExpectHasSubstr(Parse(original).status().error_message(), + ExpectHasSubstr(ParseHloString(original).status().error_message(), 
"expects only one ENTRY"); } @@ -1296,25 +1341,10 @@ ENTRY consts { ROOT const1 = f32[1]{0} constant({12345}) ROOT const2 = f32[1]{0} constant({12345}) })"; - ExpectHasSubstr(Parse(original).status().error_message(), + ExpectHasSubstr(ParseHloString(original).status().error_message(), "one computation should have only one ROOT"); } -TEST_F(HloParserTest, InstructionExists) { - const string original = R"(HloModule comp_exists -c1 { - instr = f32[1]{0} constant({12345}) -} -c2 { - instr = f32[1]{0} constant({67890}) -})"; - - ExpectHasSubstr(Parse(original).status().error_message(), - R"(was parsing 3:3: error: instruction previously defined here - instr = f32[1]{0} constant({12345}) - ^)"); -} - TEST_F(HloParserTest, ComputationExists) { const string original = R"(HloModule comp_exists comp { @@ -1323,12 +1353,52 @@ comp { comp { const2 = f32[1]{0} constant({67890}) })"; - ExpectHasSubstr(Parse(original).status().error_message(), + ExpectHasSubstr(ParseHloString(original).status().error_message(), R"(was parsing 2:1: error: computation previously defined here comp { ^)"); } +TEST_F(HloParserTest, CrossComputationLookup) { + const string original = R"(HloModule cross_computation_lookup: +tcalla (a: (s32[], s32[])) -> (s32[], s32[]) { + ROOT aparam = (s32[], s32[]) parameter(0) +} + +tcallb (b: (s32[], s32[])) -> s32[] { + rparam = (s32[], s32[]) parameter(0) + ROOT gte0 = s32[] get-tuple-element(aparam), index=0 +} + +ENTRY entry { + param = (s32[], s32[]) parameter(0) + call0 = (s32[], s32[]) call(param), to_apply=tcalla + ROOT call1 = s32[] call(param), to_apply=tcallb +})"; + ExpectHasSubstr( + ParseHloString(original).status().error_message(), + "was parsing 8:39: error: instruction does not exist: aparam"); +} + +TEST_F(HloParserTest, ParseSharding) { + const string original = "{maximal device=42}"; + TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original)); + EXPECT_EQ(sharding.ToString(), original); +} + +TEST_F(HloParserTest, ParseWindow) { + Window original = window_util::MakeWindow({1, 2, 3}); + TF_ASSERT_OK_AND_ASSIGN(Window parsed, + ParseWindow(window_util::ToString(original))) + EXPECT_EQ(window_util::ToString(original), window_util::ToString(parsed)); +} + +TEST_F(HloParserTest, ParseConvolutionDimensionNumbers) { + const string original = "b0f_0io->b0f"; + TF_ASSERT_OK_AND_ASSIGN(ConvolutionDimensionNumbers dnums, + ParseConvolutionDimensionNumbers(original)); + EXPECT_EQ(original, ConvolutionDimensionNumbersToString(dnums)); +} + } // namespace -} // namespace tools } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc index 5120775737bfa3..d8f1ab916b5c5c 100644 --- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc +++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc @@ -90,7 +90,7 @@ StatusOr HloPassPipeline::Run(HloModule* module) { return Status::OK(); }; - string prefix = name().ToString() + ": pipeline start"; + string prefix = std::string(name()) + ": pipeline start"; bool changed = false; string message; TF_RETURN_IF_ERROR( @@ -98,12 +98,12 @@ StatusOr HloPassPipeline::Run(HloModule* module) { const string xla_dump_per_pass_hlo_proto_to = module->config().debug_options().xla_dump_per_pass_hlo_proto_to(); if (!xla_dump_per_pass_hlo_proto_to.empty()) { - DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, name().ToString(), - "pipeline_start"); + DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, + std::string(name()), "pipeline_start"); } for (auto& 
pass : passes_) { - if (disabled_passes.count(pass->name().ToString()) > 0) { + if (disabled_passes.count(std::string(pass->name())) > 0) { VLOG(1) << " Skipping HLO pass " << pass->name() << ", disabled by --xla_disable_hlo_passes"; continue; @@ -121,7 +121,7 @@ StatusOr HloPassPipeline::Run(HloModule* module) { run_invariant_checkers(StrCat("after running pass: ", pass->name()))); if (!xla_dump_per_pass_hlo_proto_to.empty()) { DumpModuleProto(*module, xla_dump_per_pass_hlo_proto_to, - name().ToString(), pass->name().ToString()); + std::string(name()), std::string(pass->name())); } changed |= changed_this_pass; diff --git a/tensorflow/compiler/xla/service/hlo_reachability.cc b/tensorflow/compiler/xla/service/hlo_reachability.cc index 8e167633bb1347..4738e46f8aeb96 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.cc +++ b/tensorflow/compiler/xla/service/hlo_reachability.cc @@ -33,17 +33,27 @@ bool HloReachabilityMap::SetReachabilityToUnion( const HloInstruction* instruction) { BitVector& bit_vector = GetBitVector(instruction); tmp_bit_vector_ = bit_vector; + SetReachabilityToUnionHelper(inputs, instruction, &bit_vector); + return bit_vector != tmp_bit_vector_; +} +void HloReachabilityMap::FastSetReachabilityToUnion( + tensorflow::gtl::ArraySlice inputs, + const HloInstruction* instruction) { + SetReachabilityToUnionHelper(inputs, instruction, &GetBitVector(instruction)); +} + +void HloReachabilityMap::SetReachabilityToUnionHelper( + tensorflow::gtl::ArraySlice inputs, + const HloInstruction* instruction, BitVector* bit_vector) { // If instruction is part of inputs, don't reset the bit_vector. if (std::find(inputs.begin(), inputs.end(), instruction) == inputs.end()) { - bit_vector.SetToZero(); + bit_vector->SetToZero(); } - bit_vector.Set(GetIndex(instruction)); + bit_vector->Set(GetIndex(instruction)); for (const HloInstruction* input : inputs) { - bit_vector.OrWith(GetBitVector(input)); + bit_vector->OrWith(GetBitVector(input)); } - - return bit_vector != tmp_bit_vector_; } void HloReachabilityMap::SetReachable(const HloInstruction* a, diff --git a/tensorflow/compiler/xla/service/hlo_reachability.h b/tensorflow/compiler/xla/service/hlo_reachability.h index 553ec11f6f9a29..69bb2b3cee6daf 100644 --- a/tensorflow/compiler/xla/service/hlo_reachability.h +++ b/tensorflow/compiler/xla/service/hlo_reachability.h @@ -57,6 +57,11 @@ class HloReachabilityMap { tensorflow::gtl::ArraySlice inputs, const HloInstruction* instruction); + // As above, but faster because it does not check if the reachability changed. + void FastSetReachabilityToUnion( + tensorflow::gtl::ArraySlice inputs, + const HloInstruction* instruction); + // Sets entry so that IsReachable(a, b) will return true // // !!! THIS FUNCTION DOES NOT COMPUTE REACHABILITY !!! It sets the adjacency @@ -133,6 +138,11 @@ class HloReachabilityMap { return bit_vectors_[GetIndex(instruction)]; } + // Helper for SetReachabilityToUnion/FastSetReachabilityToUnion. + void SetReachabilityToUnionHelper( + tensorflow::gtl::ArraySlice inputs, + const HloInstruction* instruction, BitVector* bit_vector); + // Return the index of the given instruction. The value is used to index into // the vector of BitVectors and the BitVectors themselves. 
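The only difference between SetReachabilityToUnion and the new FastSetReachabilityToUnion above is whether the previous bit vector is copied so the caller can learn whether anything changed. A hedged usage sketch (all names other than the two methods are placeholders):

```c++
#include "tensorflow/compiler/xla/service/hlo_reachability.h"
#include "tensorflow/core/lib/gtl/array_slice.h"

// Sketch only: 'reachability', 'operands' and 'instr' stand in for whatever
// the caller already has on hand.
void UpdateReachability(
    xla::HloReachabilityMap* reachability,
    tensorflow::gtl::ArraySlice<const xla::HloInstruction*> operands,
    const xla::HloInstruction* instr) {
  // Needs the "did the set change?" answer, e.g. to drive a fixed-point loop:
  // pays for a copy of the bit vector plus a comparison.
  bool changed = reachability->SetReachabilityToUnion(operands, instr);
  (void)changed;

  // Does not need the answer: the fast variant skips the copy and comparison.
  reachability->FastSetReachabilityToUnion(operands, instr);
}
```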
int GetIndex(const HloInstruction* instruction) const { diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc index b0632448933df4..39b85de0f12024 100644 --- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc +++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/primitive_util.h" +#include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/flatten_call_graph.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_dce.h" @@ -30,7 +31,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" #include "tensorflow/compiler/xla/service/hlo_scheduling.h" -#include "tensorflow/compiler/xla/service/liveness_util.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -273,9 +273,8 @@ ItemList GetUsers(const InstructionList& instruction_list, for (const BufferAlias& buffer_alias : points_to_analysis.GetBufferAliases(*logical_buffer)) { for (const HloInstruction* user : buffer_alias.instruction()->users()) { - if (DoesNotUseOperandBuffer(buffer_alias.instruction(), - buffer_alias.index(), user, - points_to_analysis)) { + if (points_to_analysis.DoesNotUseOperandBuffer( + buffer_alias.instruction(), buffer_alias.index(), user)) { // The alias may be an operand of 'user', but the LogicalBuffer cannot // possibly be used by the instruction so ignore 'user'. This is the // case, for example, for the tuple element buffers in a GetTupleElement @@ -1216,7 +1215,7 @@ StatusOr HloRematerialization::Run( // Create initial sequence of HLO instructions. TF_ASSIGN_OR_RETURN(*sequence, CreateMemoryMinimizingSequence( *module, - [this](const LogicalBuffer& buffer) { + [this](const BufferValue& buffer) { return size_function_(buffer.shape()); }, scheduler_algorithm_)); diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc index 81c43db292a75d..e1f9d8efd49740 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.cc +++ b/tensorflow/compiler/xla/service/hlo_runner.cc @@ -19,13 +19,12 @@ limitations under the License. 
#include #include -#include "absl/memory/memory.h" #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/common_runtime/eigen_thread_pool.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/types.h" @@ -37,7 +36,7 @@ HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string, const DebugOptions& debug_options) { HloModuleConfig config; config.set_debug_options(debug_options); - return tools::Parse(hlo_string, config); + return ParseHloString(hlo_string, config); } namespace { @@ -81,7 +80,7 @@ HloRunner::ReadModuleFromHloTextFile(const std::string& filename, filename, &hlo_string)); HloModuleConfig config; config.set_debug_options(debug_options); - return tools::Parse(hlo_string, config); + return ParseHloString(hlo_string, config); } HloRunner::HloRunner(se::Platform* platform) { @@ -93,53 +92,108 @@ HloRunner::HloRunner(se::Platform* platform) { HloRunner::~HloRunner() {} -StatusOr> HloRunner::Execute( - std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - bool run_hlo_passes) { - TF_ASSIGN_OR_RETURN(std::unique_ptr executable, - CreateExecutable(std::move(module), run_hlo_passes)); - se::Stream stream(backend().default_stream_executor()); - stream.Init(); +StatusOr HloRunner::TransferLiteralToDevice( + const Literal& literal) { + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer buffer, + backend().transfer_manager()->AllocateScopedShapedBuffer( + literal.shape(), backend().memory_allocator(), + backend().default_device_ordinal())); + TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( + backend().default_stream_executor(), literal, buffer)); + return std::move(buffer); +} - ServiceExecutableRunOptions service_run_options(GetServiceRunOptionsForDevice( - backend().default_device_ordinal(), &stream, nullptr)); - const ExecutableRunOptions& run_options = service_run_options.run_options(); +StatusOr> HloRunner::TransferLiteralsToDevice( + const tensorflow::gtl::ArraySlice literals) { + std::vector buffers; + for (const Literal* literal : literals) { + CHECK(literal != nullptr); + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer buffer, + TransferLiteralToDevice(*literal)); + buffers.push_back(std::move(buffer)); + } + return std::move(buffers); +} - // Copy arguments to device. 
- std::vector argument_buffers; - for (Literal* argument : arguments) { - TF_ASSIGN_OR_RETURN( - ScopedShapedBuffer argument_buffer, - backend().transfer_manager()->AllocateScopedShapedBuffer( - argument->shape(), run_options.allocator(), - run_options.device_ordinal())); - TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice( - stream.parent(), *argument, argument_buffer)); - argument_buffers.push_back(std::move(argument_buffer)); +StatusOr> HloRunner::TransferLiteralsToDevice( + const tensorflow::gtl::ArraySlice> literals) { + std::vector literal_pointers; + literal_pointers.reserve(literals.size()); + for (const auto& literal : literals) { + literal_pointers.push_back(literal.get()); } + return TransferLiteralsToDevice(literal_pointers); +} + +StatusOr> HloRunner::TransferLiteralFromDevice( + const ShapedBuffer& buffer) { + return backend().transfer_manager()->TransferLiteralFromDevice( + backend().default_stream_executor(), buffer); +} - std::vector argument_buffer_ptrs; - argument_buffer_ptrs.reserve(argument_buffers.size()); - for (const auto& buf : argument_buffers) { - argument_buffer_ptrs.push_back(&buf); +StatusOr> HloRunner::Execute( + std::unique_ptr module, + const tensorflow::gtl::ArraySlice arguments, + bool run_hlo_passes, ExecutionProfile* profile) { + TF_ASSIGN_OR_RETURN(std::vector argument_buffers, + TransferLiteralsToDevice(arguments)); + TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, + ExecuteWithDeviceBuffers( + /*module=*/std::move(module), + /*arguments=*/argument_buffers, + /*run_hlo_passes=*/run_hlo_passes, + /*profile=*/profile)); + return TransferLiteralFromDevice(result); +} + +StatusOr> HloRunner::Execute( + std::unique_ptr module, + const tensorflow::gtl::ArraySlice> arguments, + bool run_hlo_passes, ExecutionProfile* profile) { + // Construct a vector of plain pointers for the arguments. + std::vector argument_pointers; + argument_pointers.reserve(arguments.size()); + for (const auto& argument : arguments) { + argument_pointers.push_back(argument.get()); } + return Execute( + /*module=*/std::move(module), + /*arguments=*/argument_pointers, + /*run_hlo_passes=*/run_hlo_passes, + /*profile=*/profile); +} - TF_ASSIGN_OR_RETURN( - ScopedShapedBuffer result, - executable->ExecuteOnStreamWrapper( - &service_run_options, /*profile=*/nullptr, argument_buffer_ptrs)); - - auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice( - stream.parent(), result); - if (result_literal.ok()) { - VLOG(4) << "Executed binary and got result: " - << result_literal.ValueOrDie()->ToString(); - } else { - VLOG(4) << "Executed binary and got status: " - << result_literal.status().ToString(); +StatusOr HloRunner::ExecuteWithDeviceBuffers( + std::unique_ptr module, + const tensorflow::gtl::ArraySlice arguments, + bool run_hlo_passes, ExecutionProfile* profile) { + // Get service run options. 
+ se::Stream stream(backend().default_stream_executor()); + stream.Init(); + ServiceExecutableRunOptions service_run_options = + GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream, + nullptr); + + TF_ASSIGN_OR_RETURN(std::unique_ptr executable, + CreateExecutable(std::move(module), run_hlo_passes)); + return executable->ExecuteOnStreamWrapper(&service_run_options, + /*profile=*/profile, arguments); +} + +StatusOr HloRunner::ExecuteWithDeviceBuffers( + std::unique_ptr module, + const tensorflow::gtl::ArraySlice arguments, + bool run_hlo_passes, ExecutionProfile* profile) { + std::vector argument_pointers; + argument_pointers.reserve(arguments.size()); + for (const auto& argument : arguments) { + argument_pointers.push_back(&argument); } - return result_literal; + return ExecuteWithDeviceBuffers( + /*module=*/std::move(module), + /*arguments=*/argument_pointers, + /*run_hlo_passes=*/run_hlo_passes, + /*profile=*/profile); } StatusOr>> HloRunner::ExecuteReplicated( @@ -171,7 +225,7 @@ StatusOr>> HloRunner::ExecuteReplicated( int64 device = device_assignment(i, 0); TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor, backend().stream_executor(device)); - streams.push_back(absl::make_unique(executor)); + streams.push_back(MakeUnique(executor)); streams.back()->Init(); service_run_options.emplace_back(GetServiceRunOptionsForDevice( device, streams.back().get(), &device_assignment)); @@ -198,7 +252,7 @@ StatusOr>> HloRunner::ExecuteReplicated( num_threads += options.num_replicas; } if (num_threads > 0) { - pool = absl::make_unique( + pool = MakeUnique( tensorflow::Env::Default(), "infeed_outfeed", /*num_threads=*/num_threads); } @@ -229,7 +283,7 @@ StatusOr>> HloRunner::ExecuteReplicated( VLOG(1) << "Starting outfeed on device " << device; for (int64 step = 1; options.infeed_steps < 0 || step <= options.infeed_steps; ++step) { - auto literal = absl::make_unique(); + auto literal = MakeUnique(); TF_CHECK_OK(backend().transfer_manager()->TransferLiteralFromOutfeed( executor, options.outfeed_shape, literal.get())); if (options.outfeed_values != nullptr) { @@ -278,14 +332,14 @@ ServiceExecutableRunOptions HloRunner::GetServiceRunOptionsForDevice( run_options.set_device_ordinal(device); run_options.set_stream(stream); run_options.set_allocator(backend().memory_allocator()); - run_options.set_inter_op_thread_pool(backend().inter_op_thread_pool()); run_options.set_intra_op_thread_pool( backend().eigen_intra_op_thread_pool_device()); if (device_assignment != nullptr) { run_options.set_device_assignment(device_assignment); } - return ServiceExecutableRunOptions(run_options, backend().StreamBorrower(), - backend().inter_op_thread_pool()); + return ServiceExecutableRunOptions( + run_options, backend().StreamBorrower(), + /*xla_intra_op_thread_pool=*/backend().eigen_intra_op_thread_pool()); } Backend& HloRunner::backend() { @@ -296,4 +350,8 @@ Backend& HloRunner::backend() { return *backend_; } +const Backend& HloRunner::backend() const { + return const_cast(this)->backend(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h index 53f7c6fe4a0911..65537f07f56e74 100644 --- a/tensorflow/compiler/xla/service/hlo_runner.h +++ b/tensorflow/compiler/xla/service/hlo_runner.h @@ -102,6 +102,15 @@ class HloRunner { static StatusOr> ReadModuleFromHloTextFile( const std::string& filename, const DebugOptions& debug_options); + // Transfers data between the host and device. 
+ StatusOr TransferLiteralToDevice(const Literal& literal); + StatusOr> TransferLiteralsToDevice( + const tensorflow::gtl::ArraySlice literals); + StatusOr> TransferLiteralsToDevice( + const tensorflow::gtl::ArraySlice> literals); + StatusOr> TransferLiteralFromDevice( + const ShapedBuffer& buffer); + // Executes the given module with given literals as input and returns the // result as a Literal. // @@ -109,20 +118,25 @@ class HloRunner { // optimization. StatusOr> Execute( std::unique_ptr module, - const tensorflow::gtl::ArraySlice arguments, - bool run_hlo_passes = true); + const tensorflow::gtl::ArraySlice arguments, + bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); StatusOr> Execute( std::unique_ptr module, const tensorflow::gtl::ArraySlice> arguments, - bool run_hlo_passes = true) { - // Construct a vector of plain pointers for the arguments. - std::vector argument_pointers; - c_transform( - arguments, std::back_inserter(argument_pointers), - [](const std::unique_ptr& literal) { return literal.get(); }); - return Execute(std::move(module), argument_pointers, run_hlo_passes); - } + bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); + + // As Execute(), but accepts and returns device buffers instead of host + // buffers. + StatusOr ExecuteWithDeviceBuffers( + std::unique_ptr module, + const tensorflow::gtl::ArraySlice arguments, + bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); + + StatusOr ExecuteWithDeviceBuffers( + std::unique_ptr module, + const tensorflow::gtl::ArraySlice arguments, + bool run_hlo_passes = true, ExecutionProfile* profile = nullptr); // Executes a given HLO module into a set of replicas, and returns a map // with the replica number as key, and the corresponding returned literal as @@ -137,6 +151,7 @@ class HloRunner { // This creates the backend lazily so it's possible to instantiate an // HloRunner in a program without any backends linked in. Backend& backend(); + const Backend& backend() const; private: // Creates an executable object given an HLO module. If run_hlo_passes is diff --git a/tensorflow/compiler/xla/service/hlo_scheduling.cc b/tensorflow/compiler/xla/service/hlo_scheduling.cc index 1a767628f6e2d3..68b2cde83a2eb4 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling.cc @@ -62,7 +62,34 @@ StatusOr MinimumMemoryForSequence( namespace { // Class implementing a list scheduler of HLO instructions which produces a -// sequence which minimizes memory usage. +// sequence which minimizes memory usage by preferring to schedule the node that +// frees bigger buffer and defines smaller outputs. +// +// Note that list scheduler is a greedy algorithm which cannot guarantee a +// global optimal solution. As a counterexample, considering the following +// graph: +// +// +--> B ===> C -------+ +// A -> | | +// | v +// +--> D ---> F=======>G +// | ^ +// | | +// +--> E -----+ +// +// --> : Buffer with size 1 +// ==> : Buffer with size 2 +// +// The list scheduler will always try to defer scheduling B in a greedy way +// since its output buffer is bigger than input. The sequence it creates will +// be: +// A D E F B C G +// , which has a maximum memory usage of 6 (B is alive while F is executing). +// +// An optimal way to shedule the previous graph is: +// A B C D E F G +// , which has a maximum memory usage of 5 (when F is executing). 
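The counterexample graph in the comment above motivates comparing candidate schedules by their peak live memory. Below is a small standalone sketch (toy types, not XLA code) of that measurement under a simple model: each instruction defines one output buffer, a buffer stays live until its last user has run, and an instruction's operands and output are all live while it executes. XLA's own heap simulator accounts for aliasing and tuple buffers, so the exact figures quoted in the comment come from that machinery rather than this sketch.

```cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Instruction {
  std::string name;
  std::vector<std::string> operands;  // names of producing instructions
  std::int64_t output_bytes = 0;      // size of this instruction's output
};

// Peak live memory of running `schedule` (a topological order of `graph`).
std::int64_t PeakMemory(const std::vector<Instruction>& graph,
                        const std::vector<std::string>& schedule) {
  std::map<std::string, const Instruction*> by_name;
  std::map<std::string, int> last_use;  // schedule position of last user
  for (const Instruction& instr : graph) by_name[instr.name] = &instr;
  for (int pos = 0; pos < static_cast<int>(schedule.size()); ++pos) {
    for (const std::string& operand : by_name.at(schedule[pos])->operands) {
      last_use[operand] = pos;
    }
  }
  std::int64_t live = 0, peak = 0;
  std::set<std::string> live_buffers;
  for (int pos = 0; pos < static_cast<int>(schedule.size()); ++pos) {
    const Instruction& instr = *by_name.at(schedule[pos]);
    live += instr.output_bytes;   // output allocated for the op
    live_buffers.insert(instr.name);
    peak = std::max(peak, live);  // operands are still live at this point
    for (const std::string& operand : instr.operands) {
      if (last_use[operand] == pos && live_buffers.erase(operand)) {
        live -= by_name.at(operand)->output_bytes;  // last use: free it
      }
    }
  }
  return peak;
}
```

Feeding this the A..G graph from the comment with the two orders shown there illustrates why the greedy list order can end up worse than the optimal one.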
+// class ListScheduler { public: // Construct and return a memory-minimizing sequence of HLO instructions @@ -70,8 +97,11 @@ class ListScheduler { static StatusOr> Run( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function) { - ListScheduler scheduler(computation, points_to_analysis, size_function); + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation) { + ListScheduler scheduler(computation, points_to_analysis, size_function, + memory_by_computation); return scheduler.CreateSchedule(); } @@ -92,10 +122,13 @@ class ListScheduler { ListScheduler(const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function) + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation) : computation_(computation), points_to_analysis_(points_to_analysis), - size_function_(size_function) { + size_function_(size_function), + memory_by_computation_(memory_by_computation) { // Create a map containing the LogicalBuffer uses for each HLO // instruction. An HLO instruction "uses" a LogicalBuffer if the // LogicalBuffer is in an operand of the instruction as indicated by @@ -185,6 +218,12 @@ class ListScheduler { } // Returns the number of bytes freed if the HLO instruction is scheduled. + // If the instruction calls subcomputations, we count the memory used by the + // subcomputations as memory "defined" by the instruction. This is not + // entirely accurate, because subcomputation memory will be freed after the + // instruction finishes. But it is more accurate than not taking + // subcomputations into account at all. In the future, we may improve + // accounting for subcomputation memory (b/65409243). int64 BytesFreedIfScheduled(const ReadyListEntry& entry) { int64 freed_bytes = 0; for (const auto& kv : entry.used_buffer_unscheduled_use_counts) { @@ -194,7 +233,19 @@ class ListScheduler { freed_bytes += size_function_(*buffer); } } - return freed_bytes - entry.bytes_defined; + // We only count the memory usage of the largest subcomputation, instead of + // adding them all, because subcomputations won't execute in parallel. + int64 max_subcomputation_bytes = 0; + for (const auto* c : entry.instruction->called_computations()) { + auto it = memory_by_computation_.find(c); + if (it != memory_by_computation_.end()) { + int64 subcomputation_bytes = it->second; + if (subcomputation_bytes > max_subcomputation_bytes) { + max_subcomputation_bytes = subcomputation_bytes; + } + } + } + return freed_bytes - entry.bytes_defined - max_subcomputation_bytes; } // Constructs the scheduling priority of the given instruction. @@ -248,6 +299,8 @@ class ListScheduler { auto best_it = ready_queue.end(); --best_it; const HloInstruction* best = best_it->second.instruction; + VLOG(2) << "Schedule instruction: " << best->ToShortString() + << " Bytes freed: " << best_it->first.first; ready_queue.erase(best_it); ready_instructions.erase(best); schedule.push_back(best); @@ -315,6 +368,11 @@ class ListScheduler { const HloComputation& computation_; const TuplePointsToAnalysis& points_to_analysis_; const LogicalBuffer::SizeFunction& size_function_; + // Computations are analyzed in post-order. When scheduling an instruction + // that includes subcomputations, such as a while loop, we use this map to + // look up the memory needed by subcomputations. 
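As a standalone illustration of the accounting described above for `BytesFreedIfScheduled` (toy types, not XLA's): bytes freed from operand buffers whose last unscheduled use this is, minus bytes newly defined, minus the memory of the largest called subcomputation, since subcomputations never run in parallel.

```cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct ReadyListEntry {
  std::int64_t bytes_defined = 0;  // new buffers this instruction creates
  // Buffer size paired with the number of not-yet-scheduled uses left.
  std::vector<std::pair<std::int64_t, int>> used_buffer_unscheduled_use_counts;
  std::vector<std::string> called_computations;
};

std::int64_t BytesFreedIfScheduled(
    const ReadyListEntry& entry,
    const std::map<std::string, std::int64_t>& memory_by_computation) {
  std::int64_t freed_bytes = 0;
  for (const auto& size_and_uses : entry.used_buffer_unscheduled_use_counts) {
    // If this is the last unscheduled use, the buffer can be freed afterwards.
    if (size_and_uses.second == 1) freed_bytes += size_and_uses.first;
  }
  // Count only the largest subcomputation: they do not execute in parallel.
  std::int64_t max_subcomputation_bytes = 0;
  for (const std::string& c : entry.called_computations) {
    auto it = memory_by_computation.find(c);
    if (it != memory_by_computation.end()) {
      max_subcomputation_bytes = std::max(max_subcomputation_bytes, it->second);
    }
  }
  return freed_bytes - entry.bytes_defined - max_subcomputation_bytes;
}
```

The list scheduler then prefers the ready instruction with the largest value of this quantity, which is why instructions that call expensive subcomputations get deferred.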
+ const tensorflow::gtl::FlatMap& + memory_by_computation_; // A map containing the LogicalBuffers that each instruction uses. tensorflow::gtl::FlatMap> CreateMemoryMinimizingSequence( + const HloComputation& computation, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function, + const MemorySchedulerAlgorithm& algorithm, + const tensorflow::gtl::FlatMap& + memory_by_computation) { + VLOG(2) << "Computation: " << computation.name(); + if (algorithm) { + return algorithm(computation, points_to_analysis, size_function, + memory_by_computation); + } + return DefaultMemoryScheduler(computation, points_to_analysis, size_function, + memory_by_computation); +} + +} // namespace + StatusOr MinimumMemoryForComputation( const HloComputation& computation, const std::vector& sequence, @@ -352,24 +428,12 @@ StatusOr MinimumMemoryForComputation( return result.heap_size; } -StatusOr> CreateMemoryMinimizingSequence( - const HloComputation& computation, - const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function, - const MemorySchedulerAlgorithm& algorithm) { - VLOG(2) << "Computation: " << computation.name(); - if (algorithm) { - return algorithm(computation, points_to_analysis, size_function); - } - return DefaultMemoryScheduler(computation, points_to_analysis, size_function); -} - -} // namespace - StatusOr> DFSMemoryScheduler( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function) { + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation) { // This ordering is based on DFS post-order, with a heuristic to decide which // operand to visit first. The heuristic is based on 'extra_users', which is // simply users-1 for each instruction. By subtracting 1, we're saying that @@ -395,6 +459,13 @@ StatusOr> DFSMemoryScheduler( extra_users[hlo] += extra_users[operand]; total_sizes[hlo] += total_sizes[operand]; } + // total_sizes[hlo] transitively includes the sizes of all nodes that + // lead to it. But computation is a DAG, so we are double-counting nodes, + // which can lead to overflows for large programs. + // cumulative_total_size caps the size to prevent overflows. + // NOTE(dimvar): this is quite ugly and should be changed. It's unclear + // why we care about transitive sizes; when scheduling a node, its input + // and output buffers should be all that matters, not its "history". 
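A standalone sketch of the accumulation pattern used by the DFS heuristic above (toy graph types, not XLA's): `extra_users` and `total_sizes` are summed over operands in post order, and `total_sizes` is clamped to the running cumulative size so that double counting over a DAG cannot overflow, exactly the concern raised in the note above.

```cpp
#include <algorithm>
#include <cstdint>
#include <map>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<std::string> operands;
  int user_count = 0;             // number of users of this node
  std::int64_t output_bytes = 0;  // size of this node's output buffer
};

// `post_order` lists every node after all of its operands.
void AccumulateDfsStats(const std::vector<Node>& post_order,
                        std::map<std::string, std::int64_t>* extra_users,
                        std::map<std::string, std::int64_t>* total_sizes) {
  std::int64_t cumulative_total_size = 0;
  for (const Node& node : post_order) {
    (*extra_users)[node.name] = node.user_count == 0 ? 0 : node.user_count - 1;
    (*total_sizes)[node.name] = node.output_bytes;
    cumulative_total_size += node.output_bytes;
    for (const std::string& operand : node.operands) {
      (*extra_users)[node.name] += (*extra_users)[operand];
      (*total_sizes)[node.name] += (*total_sizes)[operand];
    }
    // A DAG reaches shared operands through multiple paths, so the sum can
    // blow up; cap it at the cumulative size seen so far, as the code above
    // does, to keep the heuristic value bounded.
    (*total_sizes)[node.name] =
        std::min((*total_sizes)[node.name], cumulative_total_size);
  }
}
```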
total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size); } CHECK_EQ(extra_users.size(), computation.instruction_count()); @@ -421,52 +492,87 @@ StatusOr> DFSMemoryScheduler( })); CHECK_EQ(sequence.size(), computation.instruction_count()); return sequence; -} +} // namespace xla StatusOr> ListMemoryScheduler( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function) { - return ListScheduler::Run(computation, points_to_analysis, size_function); + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation) { + return ListScheduler::Run(computation, points_to_analysis, size_function, + memory_by_computation); +} + +StatusOr> PostOrderMemoryScheduler( + const HloComputation& computation, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation) { + const auto& post_order = computation.MakeInstructionPostOrder(); + return std::vector{post_order.begin(), + post_order.end()}; } StatusOr> DefaultMemoryScheduler( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function) { - // We try both a list-scheduler based ordering and a DFS based ordering, and - // choose whichever returns a lower min-memory, not accounting for - // fragmentation. - // - // Note that this is just a heuristic. One obvious inaccuracy is that the - // memory required for sub-computations might be different when considered - // within the caller's context. But it's good enough for now. + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation) { + // We try a few schedulers and choose whichever returns a lower min-memory, + // not accounting for fragmentation. + // - List is a scheduler that uses greedy heuristics. + // - DFS visits HLOs in postorder, with a heuristic to decide the order of + // children. + // - Postorder does not use any heuristics. + // List wins for most of our benchmarks; postorder-based schedulers win for + // some RNNs. 
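The selection logic described above (run the list, DFS, and post-order schedulers, keep whichever sequence needs the least memory) reduces to a pick-the-minimum pattern. A minimal standalone sketch, with a generic estimator standing in for `MinimumMemoryForComputation`; all names here are illustrative:

```cpp
#include <cstdint>
#include <functional>
#include <string>
#include <utility>
#include <vector>

using Sequence = std::vector<std::string>;
using MemoryEstimator = std::function<std::int64_t(const Sequence&)>;

// Returns the candidate sequence with the smallest estimated peak memory.
Sequence ChooseMinMemorySequence(
    const std::vector<std::pair<std::string, Sequence>>& candidates,
    const MemoryEstimator& minimum_memory_for_sequence) {
  const Sequence* best = nullptr;
  std::int64_t best_memory = 0;
  for (const auto& named_candidate : candidates) {
    std::int64_t memory = minimum_memory_for_sequence(named_candidate.second);
    if (best == nullptr || memory < best_memory) {
      best = &named_candidate.second;
      best_memory = memory;
    }
  }
  return best != nullptr ? *best : Sequence{};
}
```

In the default scheduler the candidate set would be the list, DFS, and post-order sequences, matching the three branches in the code that follows.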
TF_ASSIGN_OR_RETURN( std::vector list_sequence, - ListMemoryScheduler(computation, points_to_analysis, size_function)); + ListMemoryScheduler(computation, points_to_analysis, size_function, + memory_by_computation)); TF_ASSIGN_OR_RETURN( const int64 list_memory, MinimumMemoryForComputation(computation, list_sequence, points_to_analysis, size_function)); VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory); - TF_ASSIGN_OR_RETURN( - std::vector dfs_sequence, - DFSMemoryScheduler(computation, points_to_analysis, size_function)); + TF_ASSIGN_OR_RETURN(std::vector dfs_sequence, + DFSMemoryScheduler(computation, points_to_analysis, + size_function, memory_by_computation)); TF_ASSIGN_OR_RETURN( const int64 dfs_memory, MinimumMemoryForComputation(computation, dfs_sequence, points_to_analysis, size_function)); VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory); - if (list_memory <= dfs_memory) { + TF_ASSIGN_OR_RETURN( + std::vector post_order_sequence, + PostOrderMemoryScheduler(computation, points_to_analysis, size_function, + memory_by_computation)); + TF_ASSIGN_OR_RETURN( + const int64 post_order_memory, + MinimumMemoryForComputation(computation, post_order_sequence, + points_to_analysis, size_function)); + VLOG(2) << "Min-memory post order sequence: " + << HumanReadableNumBytes(post_order_memory); + + auto min_memory = std::min({dfs_memory, post_order_memory, list_memory}); + + if (min_memory == list_memory) { VLOG(2) << "Chose min-memory list sequence: " << HumanReadableNumBytes(list_memory); return list_sequence; - } else { + } else if (min_memory == dfs_memory) { VLOG(2) << "Chose min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory); return dfs_sequence; + } else { + VLOG(2) << "Chose min-memory post_order sequence: " + << HumanReadableNumBytes(post_order_memory); + return post_order_sequence; } } @@ -477,24 +583,32 @@ CreateMemoryMinimizingSequence(const HloModule& module, SequentialHloOrdering::HloModuleSequence sequence; TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(&module)); - for (const auto* computation : module.MakeNonfusionComputations()) { - TF_ASSIGN_OR_RETURN( - sequence[computation], - CreateMemoryMinimizingSequence(*computation, *points_to_analysis, - size_function, algorithm)); + tensorflow::gtl::FlatMap memory_by_computation; + for (const auto* computation : module.MakeComputationPostOrder()) { + if (!computation->IsFusionComputation()) { + TF_ASSIGN_OR_RETURN(auto one_computation_sequence, + CreateMemoryMinimizingSequence( + *computation, *points_to_analysis, size_function, + algorithm, memory_by_computation)); + memory_by_computation[computation] = + MinimumMemoryForComputation(*computation, one_computation_sequence, + *points_to_analysis, size_function) + .ValueOrDie(); + sequence[computation] = std::move(one_computation_sequence); + } } return sequence; } StatusOr> CreateMemoryMinimizingSequence( const HloComputation& computation, - const LogicalBuffer::SizeFunction& size_function, - const MemorySchedulerAlgorithm& algorithm) { + const LogicalBuffer::SizeFunction& size_function) { CHECK(!computation.IsFusionComputation()); TF_ASSIGN_OR_RETURN(std::unique_ptr points_to_analysis, TuplePointsToAnalysis::Run(computation.parent())); + tensorflow::gtl::FlatMap empty_map; return CreateMemoryMinimizingSequence(computation, *points_to_analysis, - size_function, algorithm); + size_function, nullptr, empty_map); } } // namespace xla diff --git 
a/tensorflow/compiler/xla/service/hlo_scheduling.h b/tensorflow/compiler/xla/service/hlo_scheduling.h index 068e68383deb17..49b927eefd24f4 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling.h +++ b/tensorflow/compiler/xla/service/hlo_scheduling.h @@ -34,26 +34,47 @@ StatusOr MinimumMemoryForSequence( const SequentialHloOrdering::HloModuleSequence& module_sequence, const LogicalBuffer::SizeFunction& size_function); +// Returns the minimum memory required to compute the given computation, +// assuming no fragmentation. +StatusOr MinimumMemoryForComputation( + const HloComputation& computation, + const std::vector& sequence, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function); + // A memory scheduler computes an execution sequence for the HLO instructions in // 'computation' that minimizes peak memory, given a points-to analysis result // that describes buffer aliasing, together with a target-specific size function // that maps a tensor's logical size to its padded size. typedef std::function>( const HloComputation&, const TuplePointsToAnalysis&, - const LogicalBuffer::SizeFunction&)> + const LogicalBuffer::SizeFunction&, + const tensorflow::gtl::FlatMap&)> MemorySchedulerAlgorithm; // List scheduler StatusOr> ListMemoryScheduler( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function); + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation); // DFS-order scheduler StatusOr> DFSMemoryScheduler( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function); + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation); + +// Naive Post Order scheduler +StatusOr> PostOrderMemoryScheduler( + const HloComputation& computation, + const TuplePointsToAnalysis& points_to_analysis, + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation); // The default scheduling algorithm. Runs both the list scheduler // and the DFS scheduler, and chooses whichever returns a lower min-memory, @@ -61,7 +82,9 @@ StatusOr> DFSMemoryScheduler( StatusOr> DefaultMemoryScheduler( const HloComputation& computation, const TuplePointsToAnalysis& points_to_analysis, - const LogicalBuffer::SizeFunction& size_function); + const LogicalBuffer::SizeFunction& size_function, + const tensorflow::gtl::FlatMap& + memory_by_computation); // Returns an HloModuleSequence which seeks to minimize the memory required for // the computation. size_function is the function returning the number of bytes @@ -72,10 +95,10 @@ CreateMemoryMinimizingSequence(const HloModule& module, const MemorySchedulerAlgorithm& algorithm = {}); // Overload of above that computes the sequence for a single computation. +// Currently only used by the GPU backend. 
StatusOr> CreateMemoryMinimizingSequence( const HloComputation& computation, - const LogicalBuffer::SizeFunction& size_function, - const MemorySchedulerAlgorithm& algorithm = {}); + const LogicalBuffer::SizeFunction& size_function); } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc index 74544c4a67a819..db7ef6f0d4bd96 100644 --- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc +++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc @@ -22,9 +22,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/hlo_ordering.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -77,7 +77,7 @@ TEST_F(MinimumMemoryForSequenceTest, MultiComputation) { HloComputation* entry_computation = module->AddEntryComputation(builder.Build()); - auto size_fn = [](const LogicalBuffer& buffer) { + auto size_fn = [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); }; @@ -124,7 +124,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) { TF_ASSERT_OK_AND_ASSIGN( SequentialHloOrdering::HloModuleSequence sequence, - CreateMemoryMinimizingSequence(*module, [](const LogicalBuffer& buffer) { + CreateMemoryMinimizingSequence(*module, [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape()); })); // Verify that all instructions are in the sequence. 
@@ -158,9 +158,9 @@ ENTRY root { })"; TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(module_str)); + ParseHloString(module_str)); - auto size_fn = [](const LogicalBuffer& buffer) { + auto size_fn = [](const BufferValue& buffer) { return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8); }; TF_ASSERT_OK_AND_ASSIGN( @@ -190,5 +190,199 @@ ENTRY root { instructions_by_name.at("e"))); } +TEST_F(HloSchedulingTest, ListAccountsForSubcomputations) { + // %WhileCond (cond_param: f32[4]) -> pred[] { + // %cond_param = f32[4]{0} parameter(0) + // %constant = f32[1,4]{1,0} constant(f32[1,4] { { 0, 0, 0, 0 } }) + // ROOT %not-equal-to = pred[] not-equal-to( + // f32[4]{0} %cond_param, f32[1,4]{1,0} %constant) + // } + // %WhileBody (body_param: f32[4]) -> f32[4] { + // %body_param = f32[4]{0} parameter(0) + // %constant.1 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } }) + // ROOT %subtract = f32[4]{0} subtract( + // f32[4]{0} %body_param, f32[1,4]{1,0} %constant.1) + // } + // %SubcomputationsNotAccounted () -> f32[2,4] { + // %constant.3 = f32[2,4]{1,0} constant( + // f32[2,4] { { 1, 2, 3, 4 }, { 1, 2, 3, 4 } }) + // %transpose = f32[2,4]{1,0} transpose( + // f32[2,4]{1,0} %constant.3), dimensions={0,1} + // %constant.2 = f32[1,4]{1,0} constant(f32[1,4] { { 1, 1, 1, 1 } }) + // %while = f32[4]{0} while(f32[1,4]{1,0} %constant.2), + // condition=%WhileCond, + // body=%WhileBody + // %broadcast = f32[2,4]{1,0} broadcast(f32[4]{0} %while), dimensions={0} + // ROOT %add = f32[2,4]{1,0} add( + // f32[2,4]{1,0} %transpose, f32[2,4]{1,0} %broadcast) + // } + + auto module = CreateNewModule(); + const Shape r1f32 = ShapeUtil::MakeShape(F32, {4}); + const Shape r2f32 = ShapeUtil::MakeShape(F32, {2, 4}); + + // param != 0 + // Needs 17 bytes + auto cond_builder = HloComputation::Builder("WhileCond"); + HloInstruction* cond_param = cond_builder.AddInstruction( + HloInstruction::CreateParameter(0, r1f32, "cond_param")); + HloInstruction* zero_vector = cond_builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2({{0, 0, 0, 0}}))); + cond_builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector)); + auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build()); + + // param - 1 + // Needs 16 bytes + auto body_builder = HloComputation::Builder("WhileBody"); + HloInstruction* body_param = body_builder.AddInstruction( + HloInstruction::CreateParameter(0, r1f32, "body_param")); + HloInstruction* one_vector = body_builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2({{1, 1, 1, 1}}))); + body_builder.AddInstruction(HloInstruction::CreateBinary( + r1f32, HloOpcode::kSubtract, body_param, one_vector)); + auto body_computation = module->AddEmbeddedComputation(body_builder.Build()); + + // transpose(matrix) + bcast(while) + auto builder = HloComputation::Builder(TestName()); + HloInstruction* while_init = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2({{1, 1, 1, 1}}))); + // Creates 16 bytes, ignoring subcomputations + HloInstruction* while_loop = + builder.AddInstruction(HloInstruction::CreateWhile( + r1f32, cond_computation, body_computation, while_init)); + + // Creates 32 bytes and frees 16 + HloInstruction* bcast = builder.AddInstruction( + HloInstruction::CreateBroadcast(r2f32, while_loop, {0})); + + HloInstruction* matrix = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR2( + {{1.0, 2.0, 3.0, 4.0}, {1.0, 2.0, 3.0, 
4.0}}))); + // Creates 32 bytes + HloInstruction* transpose = builder.AddInstruction( + HloInstruction::CreateTranspose(r2f32, matrix, {0, 1})); + + // Creates 32 bytes and frees 64 + HloInstruction* add = builder.AddInstruction( + HloInstruction::CreateBinary(r2f32, HloOpcode::kAdd, transpose, bcast)); + + module->AddEntryComputation(builder.Build()); + + TF_ASSERT_OK_AND_ASSIGN(SequentialHloOrdering::HloModuleSequence sequence, + CreateMemoryMinimizingSequence( + *module, + [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape()); + }, + ListMemoryScheduler)); + // Verify that all instructions are in the sequence. + EXPECT_EQ(module->entry_computation()->instruction_count(), + sequence.at(module->entry_computation()).size()); + SequentialHloOrdering ordering(module.get(), sequence); + // This schedule is an example of List's greedy heuristics being suboptimal. + // The while_loop is more expensive than transpose, so it would have been + // better to schedule it first, instead of during the busy time. + EXPECT_TRUE(ordering.ExecutesBefore(transpose, while_loop)); + EXPECT_TRUE(ordering.ExecutesBefore(transpose, bcast)); + EXPECT_TRUE(ordering.ExecutesBefore(bcast, add)); + EXPECT_TRUE(ordering.ExecutesBefore(transpose, add)); +} + +TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) { + auto builder = HloComputation::Builder(TestName()); + const auto TUPLE_SIZE = 1; + const Shape r1f32 = ShapeUtil::MakeShape(xla::F32, {6}); + + // Wrap lit in abs because constants are considered free by + // IgnoreInstruction, and it skews the accounting. + auto lit = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({1, 1, 1, 1, 1, 1}))); + auto abs_const = builder.AddInstruction( + HloInstruction::CreateUnary(r1f32, HloOpcode::kAbs, lit)); + + auto abs_abs1 = builder.AddInstruction( + HloInstruction::CreateUnary(r1f32, HloOpcode::kAbs, abs_const)); + auto tuple = builder.AddInstruction(HloInstruction::CreateTuple( + tensorflow::gtl::ArraySlice({abs_abs1}))); + auto tuple_elm = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(r1f32, tuple, 0)); + + auto abs_abs2 = builder.AddInstruction( + HloInstruction::CreateUnary(r1f32, HloOpcode::kAbs, abs_const)); + + builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, + tuple_elm, abs_abs2)); + + auto module = CreateNewModule(); + module->AddEntryComputation(builder.Build()); + TF_ASSERT_OK_AND_ASSIGN( + SequentialHloOrdering::HloModuleSequence sequence, + CreateMemoryMinimizingSequence(*module, + [&TUPLE_SIZE](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf( + buffer.shape(), TUPLE_SIZE); + }, + ListMemoryScheduler)); + + // Verify that all instructions are in the sequence. + EXPECT_EQ(module->entry_computation()->instruction_count(), + sequence.at(module->entry_computation()).size()); + SequentialHloOrdering ordering(module.get(), sequence); + // tuple allocates the tuple buffer and doesn't free anything. + // abs_abs2 uses the same buffer for input/output, so its bytes-freed is 0. + // abs_abs2 should be scheduled before tuple by List. 
+ EXPECT_TRUE(ordering.ExecutesBefore(abs_abs2, tuple)); +} + +TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) { + const Shape r1f32 = ShapeUtil::MakeShape(xla::F32, {5}); + HloComputation::Builder builder(TestName()); + + auto c1 = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({1, 1, 1, 1, 1}))); + auto c2 = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({1, 2, 3, 4, 5}))); + auto c3 = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({0, 2, 4, 6, 8}))); + + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, c1, c2)); + auto mul = builder.AddInstruction( + HloInstruction::CreateBinary(r1f32, HloOpcode::kMultiply, add, c3)); + auto tuple = builder.AddInstruction(HloInstruction::CreateTuple({add, mul})); + + auto tuple_elm = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(r1f32, tuple, 0)); + + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(r1f32, HloOpcode::kExp, c3)); + + builder.AddInstruction( + HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, exp)); + + auto module = CreateNewModule(); + auto* computation = module->AddEntryComputation(builder.Build()); + + auto fusion = computation->CreateFusionInstruction( + {tuple, mul, add}, HloInstruction::FusionKind::kLoop); + + TF_ASSERT_OK_AND_ASSIGN(SequentialHloOrdering::HloModuleSequence sequence, + CreateMemoryMinimizingSequence( + *module, + [](const BufferValue& buffer) { + return ShapeUtil::ByteSizeOf(buffer.shape(), 2); + }, + ListMemoryScheduler)); + + // Verify that all instructions are in the sequence. + EXPECT_EQ(module->entry_computation()->instruction_count(), + sequence.at(module->entry_computation()).size()); + SequentialHloOrdering ordering(module.get(), sequence); + // fusion allocates memory for the tuple elements and doesn't free anything, + // so it's more expensive than exp. + EXPECT_TRUE(ordering.ExecutesBefore(exp, fusion)); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc index 994de441237493..58224ef870096a 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding.cc @@ -49,9 +49,6 @@ string HloSharding::ToString() const { return StrCat("{", tensorflow::str_util::Join(parts, ", "), "}"); } - string result = StrCat("{", (replicated_ ? " replicated" : ""), - (maximal_ ? 
" maximal" : "")); - if (replicated_) { return "{replicated}"; } else if (maximal_) { @@ -126,6 +123,24 @@ std::vector HloSharding::TileLimitForDevice(int64 device) const { return index; } +StatusOr> HloSharding::AsShapeTree( + const Shape& shape) const { + if (IsTuple()) { + ShapeTree result(shape, HloSharding::Replicate()); + int64 num_leaves = result.leaf_count(); + TF_RET_CHECK(num_leaves == tuple_elements_.size()) + << "Shape " << ShapeUtil::HumanString(shape) << " has " << num_leaves + << " leaf nodes while this sharding has " << tuple_elements_.size(); + auto it = tuple_elements_.begin(); + for (auto& index_to_sharding : result.leaves()) { + index_to_sharding.second = *it++; + } + return std::move(result); + } else { + return ShapeTree(shape, *this); + } +} + StatusOr HloSharding::UniqueDevice() const { if (IsTuple()) { if (tuple_elements_.empty()) { @@ -367,10 +382,11 @@ HloSharding HloSharding::GetSubSharding(const Shape& shape, const ShapeIndex& index) const { CHECK(IsTuple()); - ShapeTree sub_shape_tree(ShapeUtil::GetSubshape(shape, index), - Replicate()); + Shape sub_shape = ShapeUtil::GetSubshape(shape, index); + ShapeTree sub_shape_tree(sub_shape, Replicate()); sub_shape_tree.CopySubtreeFrom(GetAsShapeTree(shape), index, {}); - return Tuple(sub_shape_tree); + return ShapeUtil::IsTuple(sub_shape) ? Tuple(sub_shape_tree) + : sub_shape_tree.element(ShapeIndex({})); } std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) { diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h index 2b8e757f42991f..f4a0fb626f2c3e 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding.h +++ b/tensorflow/compiler/xla/service/hlo_sharding.h @@ -99,6 +99,9 @@ class HloSharding { static bool IsReservedDevice(int64 device) { return device < 0; } OpSharding ToProto() const; + + // Note that this string canonically has outer curly braces, e.g. + // "{replicated}". string ToString() const; // Validate that this sharding can be applied to a tensor with shape `shape`. @@ -160,19 +163,9 @@ class HloSharding { // tuple, if IsTuple, or a ShapeTree with a single element containing this // sharding. Only the leaf elements are populated. This creates a new // ShapeTree object so is not cheap. + StatusOr> AsShapeTree(const Shape& shape) const; ShapeTree GetAsShapeTree(const Shape& shape) const { - if (IsTuple()) { - ShapeTree result(shape, HloSharding::Replicate()); - CHECK_EQ(std::distance(result.leaf_begin(), result.leaf_end()), - tuple_elements_.size()); - auto it = tuple_elements_.begin(); - for (auto& index_to_sharding : result.leaves()) { - index_to_sharding.second = *it++; - } - return result; - } else { - return ShapeTree(shape, *this); - } + return AsShapeTree(shape).ValueOrDie(); } // Retrieves the sub sharding at a given index, out of a tuple sharding. @@ -208,6 +201,12 @@ class HloSharding { return h; } + struct Hasher { + size_t operator()(const HloSharding& sharding) const { + return sharding.Hash(); + } + }; + // Gets the tile shape. // REQUIRES: !IsTileMaximal() && !IsTuple() const Shape& tile_shape() const { return tile_shape_; } diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc new file mode 100644 index 00000000000000..82cff2a4b7146c --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc @@ -0,0 +1,401 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h" + +#include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" + +namespace xla { + +namespace { + +struct PassThrough { + PassThrough(HloInstruction* user, HloInstruction* operand) + : user(user), operand(operand) {} + + HloInstruction* user = nullptr; + HloInstruction* operand = nullptr; +}; + +void SetDeviceSharding(HloInstruction* instruction, int64 device) { + VLOG(4) << " " << instruction->name() << " to device " << device; + instruction->set_device_sharding(device); +} + +tensorflow::gtl::optional ShardingUniqueDevice( + const HloSharding& sharding) { + if (sharding.IsTileMaximal()) { + auto device = sharding.UniqueDevice(); + if (device.ok()) { + return device.ValueOrDie(); + } + } + return tensorflow::gtl::optional(); +} + +bool ShardingMatches(const HloSharding& sharding1, + const HloSharding& sharding2) { + auto device1 = ShardingUniqueDevice(sharding1); + if (device1) { + auto device2 = ShardingUniqueDevice(sharding2); + if (device2) { + return *device1 == *device2; + } + } + // Anything which is not tile maximal with unique device, gets a full sharding + // compare. + return sharding1 == sharding2; +} + +// When we create domains, they are never "empty", where with empty we mean +// that a kDomain instruction has as operand another kDomain instruction of the +// same kind. +// But when the HLO optimizations are run, empty domains can be created. +// For example: +// +// Domain(device=None, device=0) -> +// Tuple(device=0) -> +// GTE(device=0) -> +// Domain(device=0, device=None) +// +// In that case the tuple simplifier could create something like: +// +// Domain(device=None, device=0) -> Domain(device=0, device=None) +// +// Which is a so called empty domain. +// In the case above, crossing an empty domain which was transiting through +// device 0, requires the normalization phase to fixup the empty domain by +// adding back a Tuple+GTE pair with the proper device. +// One particular case where this can create problems is the result of the +// entry computation, where the GTE assignments are used by TF to tell the +// XLA where the results should be sent. 
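The `ShardingMatches` helper introduced above compares two shardings loosely: if both are tile-maximal with a unique assigned device, only the devices need to agree; anything else falls back to full equality. A standalone sketch of that rule with a toy sharding type (not XLA's `HloSharding`):

```cpp
#include <cstdint>
#include <optional>

struct ToySharding {
  bool tile_maximal = false;
  // Set when the sharding assigns everything to one device.
  std::optional<std::int64_t> unique_device;
  int tile_id = 0;  // stands in for the full tiling description

  bool operator==(const ToySharding& other) const {
    return tile_maximal == other.tile_maximal &&
           unique_device == other.unique_device && tile_id == other.tile_id;
  }
};

std::optional<std::int64_t> ShardingUniqueDevice(const ToySharding& sharding) {
  return sharding.tile_maximal ? sharding.unique_device : std::nullopt;
}

bool ShardingMatches(const ToySharding& a, const ToySharding& b) {
  auto device_a = ShardingUniqueDevice(a);
  auto device_b = ShardingUniqueDevice(b);
  if (device_a && device_b) {
    return *device_a == *device_b;  // single-device shardings: compare devices
  }
  // Anything which is not tile maximal with a unique device gets a full
  // comparison.
  return a == b;
}
```

This is what keeps domain creation from inserting kDomain instructions between two instructions that are effectively pinned to the same device.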
+std::vector LocatePassThroughDomainLinks( + const DomainMetadata::Domain& domain) { + std::vector pass_through; + for (HloInstruction* instruction : domain.enter_domains) { + CHECK(instruction->opcode() == HloOpcode::kDomain) + << "Instruction is not a kDomain: " << instruction->ToString(); + for (HloInstruction* user : instruction->users()) { + if (user->opcode() == HloOpcode::kDomain && + domain.exit_domains.count(user) != 0) { + pass_through.emplace_back(user, instruction); + VLOG(2) << "Found passthrough domain link:"; + VLOG(2) << " " << user->ToString(); + VLOG(2) << " " << instruction->ToString(); + } + } + } + return pass_through; +} + +Status FixupPassThroughDomainLinks(const DomainMetadata::Domain& domain, + const HloSharding& sharding) { + for (auto& pass_through : LocatePassThroughDomainLinks(domain)) { + HloInstruction* tuple = pass_through.operand->parent()->AddInstruction( + HloInstruction::CreateTuple({pass_through.operand})); + HloInstruction* gte = pass_through.operand->parent()->AddInstruction( + HloInstruction::CreateGetTupleElement(pass_through.operand->shape(), + tuple, 0)); + gte->set_sharding(sharding); + TF_RETURN_IF_ERROR( + pass_through.operand->ReplaceUseWith(pass_through.user, gte)); + } + return Status::OK(); +} + +std::unique_ptr CloneShardingForDomain( + const HloSharding& sharding) { + auto device = ShardingUniqueDevice(sharding); + if (!device) { + return MakeUnique(sharding); + } + return MakeUnique(HloSharding::AssignDevice(*device)); +} + +Status ApplyDomainDeviceSharding(const DomainMetadata::Domain& domain, + int64 device) { + VLOG(4) << "Applying device " << device << " sharding"; + for (HloInstruction* instruction : domain.instructions) { + // We only change instructions without sharding, since otherwise we might + // mess up with eventual HLO passes which has knowledge of it. + if (!instruction->has_sharding()) { + SetDeviceSharding(instruction, device); + } else { + VLOG(4) << " " << instruction->name() << " already has sharding " + << instruction->sharding(); + } + } + return Status::OK(); +} + +// Retrieves the sharding of a tuple shaped instruction in form of a ShapeTree. +// If the instruction has no sharding, a ShapeTree with HloSharding::Replicate() +// sharding will be returned. +ShapeTree GetTupleSharding(HloInstruction* tuple) { + if (tuple->has_sharding()) { + return tuple->sharding().GetAsShapeTree(tuple->shape()); + } + return ShapeTree(tuple->shape(), HloSharding::Replicate()); +} + +// Retrieves the sharding of operand, asked from a user instruction which is +// within domain. If operand is a kDomain, it means that sharding argument is +// the operand sharding, otherwise the operand's own sharding will be returned. +const HloSharding* GetOperandSharding(const HloInstruction* operand, + const DomainMetadata::Domain& domain, + const HloSharding& sharding) { + DCHECK_EQ(domain.reach_set.count(const_cast(operand)), 1); + // Here the user of operand is within the domain instruction set, and since it + // is user of operand, we need to look into the enter_domains set. If this is + // not a kDomain within the user domains set, then return the operand + // sharding, if any. + if (operand->opcode() != HloOpcode::kDomain || + domain.enter_domains.count(const_cast(operand)) == 0) { + return operand->has_sharding() ? &operand->sharding() : nullptr; + } + // At this point operand is a kDomain of the currently processed domain, so we + // can refer to sharding as the domain sharding. 
+ return &sharding; +} + +// Tries to propagate the sharding information into the instructions that are +// part of the domain, in a post order manner (operand propagate to user). +StatusOr ApplyDomainShardingPass(const DomainMetadata::Domain& domain, + const HloSharding& sharding) { + int64 assigned = 0; + for (HloInstruction* instruction : domain.instructions) { + if (instruction->has_sharding()) { + continue; + } + if (instruction->opcode() == HloOpcode::kGetTupleElement) { + HloInstruction* tuple = instruction->mutable_operand(0); + const HloSharding* tuple_sharding = + GetOperandSharding(tuple, domain, sharding); + if (tuple_sharding != nullptr) { + TF_RET_CHECK(tuple_sharding->IsTuple()) << tuple->ToString(); + HloSharding sub_sharding = tuple_sharding->GetSubSharding( + tuple->shape(), {instruction->tuple_index()}); + VLOG(4) << " " << instruction->name() << " to sharding " + << sub_sharding; + instruction->set_sharding(sub_sharding); + ++assigned; + } + } else if (instruction->opcode() == HloOpcode::kTuple) { + int64 tuple_assigned = 0; + ShapeTree shape_tree = GetTupleSharding(instruction); + for (int64 i = 0; i < instruction->operand_count(); ++i) { + const HloSharding* operand_sharding = + GetOperandSharding(instruction->operand(i), domain, sharding); + if (operand_sharding != nullptr && + shape_tree.element({i}) != *operand_sharding) { + *shape_tree.mutable_element({i}) = *operand_sharding; + ++tuple_assigned; + } + } + if (tuple_assigned > 0) { + HloSharding tuple_sharding = HloSharding::Tuple(shape_tree); + VLOG(4) << " " << instruction->name() << " to sharding " + << tuple_sharding; + instruction->set_sharding(tuple_sharding); + ++assigned; + } + } else { + // If all the operand of the given instruction has the same single device + // assignment, assign that device to this instruction as well. + const HloSharding* common_sharding = nullptr; + for (const HloInstruction* operand : instruction->operands()) { + const HloSharding* operand_sharding = + GetOperandSharding(operand, domain, sharding); + if (operand_sharding != nullptr) { + if (common_sharding != nullptr && + *common_sharding != *operand_sharding) { + common_sharding = nullptr; + break; + } + common_sharding = operand_sharding; + } + } + if (common_sharding != nullptr) { + VLOG(4) << " " << instruction->name() << " to sharding " + << *common_sharding; + instruction->set_sharding(*common_sharding); + ++assigned; + } + } + } + return assigned; +} + +Status ApplyDomainSharding(const DomainMetadata::Domain& domain, + const HloSharding& sharding) { + auto device = ShardingUniqueDevice(sharding); + if (device) { + // Shortcut the simple case. We have a unique device sharding, so we call + // the ApplyDomainDeviceSharding() API which will apply array or tuple + // shaped device sharding to the domain instructions. + return ApplyDomainDeviceSharding(domain, *device); + } + VLOG(1) << "Assigning non-trivial sharding " << sharding; + for (;;) { + TF_ASSIGN_OR_RETURN(int64 assigned, + ApplyDomainShardingPass(domain, sharding)); + if (assigned == 0) { + break; + } + } + int64 unassigned = 0; + for (HloInstruction* instruction : domain.instructions) { + if (!instruction->has_sharding()) { + LOG(WARNING) << "Unassigned instruction: " << instruction->ToString(); + ++unassigned; + } + } + // Should we error out if unassigned > 0? + return Status::OK(); +} + +// Creates a kDomain instruction to be placed between instruction and operand. 
+// The kDomain instruction will be created only if the sharding differ between +// the instruction and the operand. +std::unique_ptr CreateDomain(HloInstruction* instruction, + HloInstruction* operand) { + const HloSharding* instruction_sharding = + instruction->has_sharding() ? &instruction->sharding() : nullptr; + const HloSharding* operand_sharding = + operand->has_sharding() ? &operand->sharding() : nullptr; + // No need for domain if they both have no sharding. + if (instruction_sharding == nullptr && operand_sharding == nullptr) { + return nullptr; + } + // No need for domain if they match. + if (instruction_sharding != nullptr && operand_sharding != nullptr && + ShardingMatches(*instruction_sharding, *operand_sharding)) { + return nullptr; + } + std::unique_ptr real_instruction_sharding; + std::unique_ptr real_operand_sharding; + if (instruction_sharding != nullptr) { + real_instruction_sharding = CloneShardingForDomain(*instruction_sharding); + } + if (operand_sharding != nullptr) { + real_operand_sharding = CloneShardingForDomain(*operand_sharding); + } + VLOG(3) << "Creating domain:"; + VLOG(3) << " Instruction: " << instruction->name(); + VLOG(3) << " Operand: " << operand->name(); + VLOG(3) << " User side sharding: " + << (real_instruction_sharding != nullptr + ? real_instruction_sharding->ToString() + : "None"); + VLOG(3) << " Operand side sharding: " + << (real_operand_sharding != nullptr + ? real_operand_sharding->ToString() + : "None"); + + std::unique_ptr operand_side_metadata = + MakeUnique(std::move(real_operand_sharding)); + std::unique_ptr user_side_metadata = + MakeUnique(std::move(real_instruction_sharding)); + return HloInstruction::CreateDomain(operand->shape(), operand, + std::move(operand_side_metadata), + std::move(user_side_metadata)); +} + +StatusOr> ExtractOriginalCommonSharding( + tensorflow::gtl::ArraySlice instructions) { + // If we are here, all the instructions being passed had the same sharding + // (or no sharding), by the means of the ShardingMatches() API. + // As such, no kDomain was inserted, and here we are asked to extract the + // original common sharding. + // All the instructions passed to this API are part of the same computation. + const HloSharding* sharding = nullptr; + for (HloInstruction* instruction : instructions) { + if (instruction->has_sharding()) { + if (sharding == nullptr) { + sharding = &instruction->sharding(); + } else { + TF_RET_CHECK(ShardingMatches(*sharding, instruction->sharding())) + << "Sharding " << *sharding << " does not match the one in " + << instruction->ToString(); + } + } + } + if (sharding == nullptr) { + return std::unique_ptr(); + } + VLOG(4) << "Extracted sharding is " << *sharding; + return CloneShardingForDomain(*sharding); +} + +} // namespace + +std::unique_ptr ShardingMetadata::Clone() const { + std::unique_ptr sharding; + if (sharding_ != nullptr) { + sharding = MakeUnique(*sharding_); + } + return MakeUnique(std::move(sharding)); +} + +bool ShardingMetadata::Matches(const DomainMetadata& other) const { + const ShardingMetadata* other_ptr = + dynamic_cast(&other); + if (other_ptr == nullptr) { + // If other is not a ShardingMetadata, then it is clearly a no match. + return false; + } + if (sharding_ == nullptr) { + return other_ptr->sharding_ == nullptr; + } + return other_ptr->sharding_ != nullptr + ? ShardingMatches(*sharding_, *other_ptr->sharding_) + : false; +} + +string ShardingMetadata::ToString() const { + return sharding_ != nullptr ? 
sharding_->ToString() : "None"; +} + +Status ShardingMetadata::NormalizeInstructions( + const DomainMetadata::Domain& domain) const { + if (sharding_ != nullptr) { + VLOG(4) << "Normalizing sharding to " << sharding_->ToString() << ":"; + TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding_)); + TF_RETURN_IF_ERROR(FixupPassThroughDomainLinks(domain, *sharding_)); + } + return Status::OK(); +} + +Status NormalizeShardingDomain(const DomainMetadata::Domain& domain) { + TF_ASSIGN_OR_RETURN(std::unique_ptr sharding, + ExtractOriginalCommonSharding(domain.instructions)); + if (sharding != nullptr) { + VLOG(4) << "Normalizing sharding-less domain to " << sharding->ToString() + << ":"; + TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding)); + } else { + VLOG(1) << "Unable to find common sharding"; + } + return Status::OK(); +} + +std::unique_ptr CreateShardingDomain( + HloInstruction* instruction, HloInstruction* operand) { + return CreateDomain(instruction, operand); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.h b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h new file mode 100644 index 00000000000000..ec162c34904ee2 --- /dev/null +++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h @@ -0,0 +1,67 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_ + +#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h" +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_sharding.h" +#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/core/lib/gtl/array_slice.h" + +namespace xla { + +// A DomainMetadata implementation that internally wraps a sharding attribute. +class ShardingMetadata : public DomainMetadata { + public: + explicit ShardingMetadata(std::unique_ptr sharding) + : sharding_(std::move(sharding)) {} + + std::unique_ptr Clone() const override; + + tensorflow::StringPiece Kind() const override { return KindName(); } + + bool Matches(const DomainMetadata& other) const override; + + string ToString() const override; + + Status NormalizeInstructions( + const DomainMetadata::Domain& domain) const override; + + static tensorflow::StringPiece KindName() { return "sharding"; } + + private: + std::unique_ptr sharding_; +}; + +// Within a set of instructions which had common sharding attributes before +// entring the HLO passes pipeline, apply sharding heuristics and normalize the +// instructions whose sharding deviates from the one which is inferred as to be +// the original one. +// Policy wise, HLO passes are allowed to create new unassigned instructions, +// but if they do create assigned ones, they have to conform to the ones around. 
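`NormalizeShardingDomain`, declared just below, applies the policy spelled out above: HLO passes may leave new instructions unassigned, and normalization back-fills them with the sharding recovered from the domain, while instructions that already carry a sharding are left alone. A minimal standalone sketch of that back-fill step, with toy types and a plain string standing in for the sharding; the real code also handles tuple shardings and pass-through domains:

```cpp
#include <iostream>
#include <optional>
#include <string>
#include <vector>

struct ToyInstruction {
  std::string name;
  std::optional<std::string> sharding;  // e.g. "{maximal device=0}"
};

// Assigns `domain_sharding` to every instruction in the domain that has no
// sharding yet; instructions with an explicit sharding are left untouched.
int NormalizeDomain(std::vector<ToyInstruction*>& domain_instructions,
                    const std::string& domain_sharding) {
  int assigned = 0;
  for (ToyInstruction* instruction : domain_instructions) {
    if (!instruction->sharding.has_value()) {
      instruction->sharding = domain_sharding;
      ++assigned;
    } else if (*instruction->sharding != domain_sharding) {
      // Policy check: an instruction a pass assigned itself is expected to
      // conform to the sharding of the domain it sits in.
      std::cerr << instruction->name << " deviates from the domain sharding\n";
    }
  }
  return assigned;
}
```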
+Status NormalizeShardingDomain(const DomainMetadata::Domain& domain); + +// Given an HLO graph edge between instruction and one of its operands, creates +// a ShardingMetadata based kDomain instruction if the sharding between +// instruction and operand changes. Returns nullptr if there is no need for a +// domain separation. +std::unique_ptr CreateShardingDomain( + HloInstruction* instruction, HloInstruction* operand); + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_ diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc index 3bf0d25efb7fad..ee7133689b1534 100644 --- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc +++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc @@ -13,14 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/xla/service/hlo_sharding.h" - #include #include #include #include #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -312,5 +311,48 @@ TEST_F(HloShardingTest, OstreamTest) { EXPECT_EQ(oss.str(), "{f32[3,5,7,11] devices=[1,1,2,2]0,1,2,3}"); } +TEST_F(HloShardingTest, ParseHloString) { + auto check = [](const HloSharding& sharding) { + TF_ASSERT_OK_AND_ASSIGN(auto parsed_sharding, + ParseSharding(sharding.ToString())); + EXPECT_EQ(sharding, parsed_sharding); + }; + check(HloSharding::Replicate()); + check(HloSharding::AssignDevice(2)); + check(HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), + Array4D({{{{0}, {1}}}}))); + // Empty tuple. + check(HloSharding::Tuple(ShapeUtil::MakeTupleShape({}), {})); + { + // Non-nested tuple. + auto tuple_shape = + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 1, 5, 7}), + ShapeUtil::MakeShape(F32, {3, 5, 7}), + ShapeUtil::MakeShape(F32, {3, 7})}); + check(HloSharding::Tuple( + tuple_shape, {HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), + Array4D({{{{0}, {1}}}})), + HloSharding::Replicate(), HloSharding::AssignDevice(1)})); + } + { + // Nested tuple. + auto tuple_shape = ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {3, 1, 5, 7}), + ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {3, 5, 7}), + ShapeUtil::MakeShape(F32, {3, 7})})}); + std::vector leaf_shardings = { + HloSharding::Tile(ShapeUtil::MakeShape(F32, {3, 1, 3, 7}), + Array4D({{{{0}, {1}}}})), + HloSharding::Replicate(), HloSharding::AssignDevice(1)}; + ShapeTree sharding_tree(tuple_shape, HloSharding::Replicate()); + // Assign leaf_shardings to sharding_tree leaves. + auto it = leaf_shardings.begin(); + for (auto& index_to_sharding : sharding_tree.leaves()) { + index_to_sharding.second = *it++; + } + check(HloSharding::Tuple(sharding_tree)); + } +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc index f8d98f06785967..be156d765dc10d 100644 --- a/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc +++ b/tensorflow/compiler/xla/service/hlo_tfgraph_builder_test.cc @@ -14,7 +14,6 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" diff --git a/tensorflow/compiler/xla/tools/parser/hlo_token.h b/tensorflow/compiler/xla/service/hlo_token.h similarity index 84% rename from tensorflow/compiler/xla/tools/parser/hlo_token.h rename to tensorflow/compiler/xla/service/hlo_token.h index 7928bee5c2097f..533429608bc2e1 100644 --- a/tensorflow/compiler/xla/tools/parser/hlo_token.h +++ b/tensorflow/compiler/xla/service/hlo_token.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_ -#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_ +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_ #include @@ -22,9 +22,11 @@ limitations under the License. #include "tensorflow/core/platform/types.h" namespace xla { -namespace tools { // Defines different kinds of tokens in a hlo module string. +// +// You shouldn't need to use this directly unless you're using HloLexer +// directly, and you probably don't need to do that. Use hlo_parser instead. enum class TokKind { // Markers kEof, @@ -72,7 +74,6 @@ enum class TokKind { string TokKindToString(TokKind kind); -} // namespace tools } // namespace xla -#endif // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_ +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_ diff --git a/tensorflow/compiler/xla/service/hlo_value.cc b/tensorflow/compiler/xla/service/hlo_value.cc index 05b7dce3d1ecf9..7b27dbfec376b8 100644 --- a/tensorflow/compiler/xla/service/hlo_value.cc +++ b/tensorflow/compiler/xla/service/hlo_value.cc @@ -29,9 +29,11 @@ limitations under the License. #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/types.h" namespace xla { @@ -69,7 +71,7 @@ std::ostream& operator<<(std::ostream& out, const HloUse& use) { HloValue::HloValue(HloValue::Id id, HloInstruction* instruction, const ShapeIndex& index, bool is_phi) - : id_(id), is_phi_(is_phi) { + : BufferValue(instruction, index, id), is_phi_(is_phi) { // The defining position is always the first element in the positions_ vector. positions_.push_back(HloPosition{instruction, index}); } @@ -90,8 +92,8 @@ string HloValue::ToShortString() const { string index_str = ShapeUtil::IsTuple(defining_instruction()->shape()) ? defining_index().ToString() : ""; - return StrCat(id_, " ", is_phi_ ? "PHI " : "", defining_instruction()->name(), - index_str); + return StrCat(id(), " ", is_phi_ ? 
"PHI " : "", + defining_instruction()->name(), index_str); } string HloValue::ToString(int indent) const { diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h index 2a711e8b42590c..a1151f65e07dff 100644 --- a/tensorflow/compiler/xla/service/hlo_value.h +++ b/tensorflow/compiler/xla/service/hlo_value.h @@ -16,16 +16,20 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VALUE_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_VALUE_H_ -#include +#include #include #include +#include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_tree.h" +#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" +#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/types.h" namespace xla { @@ -80,30 +84,9 @@ struct HloUse { std::ostream& operator<<(std::ostream& out, const HloUse& use); -// Class describing a value used by the dataflow analysis. XLA arrays are -// trivially a single HloValue. Tuples are made up of more than one HloValue: an -// HloValue for the pointer vector, and an HloValue for each child element. -// -// Every HloValue is defined by a particular instruction and most instructions -// define only a single HloValue. Instructions which define a single HloValue -// include array-shaped instructions such as Add but also includes Tuple-shaped -// instructions such as Tuple. The Tuple instruction defines a single HloValue -// which is a vector of pointers to the values containing the Tuple -// instruction's operands. Though the result of the Tuple instruction includes -// multiple values only the top-level HloValue (the vector of pointers) is -// defined by the Tuple instruction. The values containing the tuple elements -// are defined by earlier instructions, usually the operands of the Tuple -// instruction. -// -// Instructions which construct both the tuple *and* the tuple elements define -// more than one HloValue. This includes (at least) tuple-shaped Constant, -// Parameter, Infeed and While instructions. These tuple-shaped instructions do -// not assemble a tuple from existing HloValues like the Tuple instruction does, -// but rather define all the HloValues in the tuple. -class HloValue { +// HloDataflowAnalysis uses this subclass of BufferValue. +class HloValue : public BufferValue { public: - using Id = int64; - // Predicate comparing HloValues by increasing id, useful for std::sort. static bool IdLessThan(const HloValue* a, const HloValue* b) { return a->id() < b->id(); @@ -120,6 +103,7 @@ class HloValue { // dataflow analysis (HloDataflowAnalysis::ssa_form_ is true). HloValue(Id id, HloInstruction* instruction, const ShapeIndex& index, bool is_phi = false); + ~HloValue() override {} // Sets the positions in the module at which the HloValue appears. Updates // uses. Should be called once and only once. The defining position should not @@ -127,10 +111,6 @@ class HloValue { void SetPositionsAndComputeUses( tensorflow::gtl::ArraySlice positions); - // Return a unique identifier for this HloValue. This value is used for stable - // sorting and iteration - Id id() const { return id_; } - // Returns whether this value is a phi value. 
bool is_phi() const { return is_phi_; } @@ -142,12 +122,18 @@ class HloValue { return defining_position().instruction; } + HloInstruction* instruction() const override { + return defining_instruction(); + } + // Return the shape index at which this HloValue is defined in the output of // its defining instruction. const ShapeIndex& defining_index() const { return defining_position().index; } + const ShapeIndex& index() const override { return defining_index(); } + // Return the shape of this HloValue. - const Shape& shape() const { return defining_position().shape(); } + const Shape& shape() const override { return defining_position().shape(); } // Return all positions of the HloValue in the module. const std::vector& positions() const { return positions_; } @@ -164,12 +150,11 @@ class HloValue { // Return a single-line string representation of the value. string ToShortString() const; - string ToString(int indent = 0) const; + string ToString(int indent) const; - private: - // Unique identifier for this HloValue. Used for stable sorting and iteration. - const Id id_; + string ToString() const override { return ToString(0); } + private: // Whether this instruction is a phi value. const bool is_phi_; diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc index 8a30cbf9cd622f..9cfd8a9bf74bc6 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.cc +++ b/tensorflow/compiler/xla/service/hlo_verifier.cc @@ -106,9 +106,7 @@ Status ShapeVerifier::HandleReducePrecision(HloInstruction* reduce_precision) { reduce_precision->mantissa_bits())); } -Status ShapeVerifier::HandleInfeed(HloInstruction*) { - return tensorflow::Status::OK(); -} +Status ShapeVerifier::HandleInfeed(HloInstruction*) { return Status::OK(); } Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) { // Outfeed has a separate shape field for the value which is outfed to the @@ -116,7 +114,7 @@ Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) { // produces no HLO value in the graph. 
if (!ShapeUtil::Compatible(outfeed->outfeed_shape(), outfeed->operand(0)->shape())) { - return InvalidArgument( + return InternalError( "Expected outfeed to have shape compatible with operand's shape %s, " "actual shape is %s:\n%s", ShapeUtil::HumanString(outfeed->operand(0)->shape()).c_str(), @@ -127,12 +125,10 @@ Status ShapeVerifier::HandleOutfeed(HloInstruction* outfeed) { } Status ShapeVerifier::HandleHostCompute(HloInstruction*) { - return tensorflow::Status::OK(); + return Status::OK(); } -Status ShapeVerifier::HandleRng(HloInstruction*) { - return tensorflow::Status::OK(); -} +Status ShapeVerifier::HandleRng(HloInstruction*) { return Status::OK(); } Status ShapeVerifier::HandleReverse(HloInstruction* reverse) { return CheckShape( @@ -164,7 +160,7 @@ Status ShapeVerifier::HandleReduce(HloInstruction* reduce) { } Status ShapeVerifier::HandleBitcast(HloInstruction* bitcast) { - return tensorflow::Status::OK(); + return Status::OK(); } Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) { @@ -183,7 +179,7 @@ Status ShapeVerifier::HandleBroadcast(HloInstruction* broadcast) { operand_shape.dimensions(operand_dimension)) << broadcast->ToString() << " operand shape " << operand_shape; } - return tensorflow::Status::OK(); + return Status::OK(); } Status ShapeVerifier::HandleReshape(HloInstruction* reshape) { @@ -191,7 +187,7 @@ Status ShapeVerifier::HandleReshape(HloInstruction* reshape) { TF_RETURN_IF_ERROR(CheckShape(reshape, reshape->shape())); TF_RET_CHECK(ShapeUtil::ElementsIn(reshape->shape()) == ShapeUtil::ElementsIn(reshape->operand(0)->shape())); - return tensorflow::Status::OK(); + return Status::OK(); } Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) { @@ -200,22 +196,18 @@ Status ShapeVerifier::HandleTranspose(HloInstruction* transpose) { transpose->operand(0)->shape(), transpose->dimensions())); } -Status ShapeVerifier::HandleParameter(HloInstruction*) { - return tensorflow::Status::OK(); +Status ShapeVerifier::HandleParameter(HloInstruction* hlo) { + return Status::OK(); } -Status ShapeVerifier::HandleFusion(HloInstruction*) { - return tensorflow::Status::OK(); -} +Status ShapeVerifier::HandleFusion(HloInstruction*) { return Status::OK(); } Status ShapeVerifier::HandleCall(HloInstruction* call) { // The shape of kCall should match the shape of the computation it calls. 
return CheckShape(call, call->to_apply()->ComputeProgramShape().result()); } -Status ShapeVerifier::HandleCustomCall(HloInstruction*) { - return tensorflow::Status::OK(); -} +Status ShapeVerifier::HandleCustomCall(HloInstruction*) { return Status::OK(); } Status ShapeVerifier::HandleSlice(HloInstruction* slice) { return CheckShape(slice, @@ -384,6 +376,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { case HloOpcode::kConstant: case HloOpcode::kCrossReplicaSum: case HloOpcode::kCustomCall: + case HloOpcode::kDomain: case HloOpcode::kFusion: case HloOpcode::kGetTupleElement: case HloOpcode::kInfeed: @@ -410,7 +403,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) { if (fp_type == PRIMITIVE_TYPE_INVALID) { fp_type = subshape.element_type(); } else if (fp_type != subshape.element_type()) { - return FailedPrecondition( + return InternalError( "Seen floating point types of different precisions in " "%s, but mixed precision is disallowed.", instruction->ToString().c_str()); @@ -490,14 +483,14 @@ Status ShapeVerifier::CheckShape(const HloInstruction* instruction, } } if (!compatible) { - return InvalidArgument( + return InternalError( "Expected instruction to have shape compatible with %s, actual " "shape is %s:\n%s", ShapeUtil::HumanString(inferred_shape).c_str(), ShapeUtil::HumanString(instruction->shape()).c_str(), instruction->ToString().c_str()); } - return tensorflow::Status::OK(); + return Status::OK(); } Status ShapeVerifier::CheckShape(const HloInstruction* instruction, @@ -541,13 +534,13 @@ Status ShapeVerifier::CheckVariadicShape(const HloInstruction* instruction) { Status ShapeVerifier::CheckSameChannel(const HloInstruction* instr1, const HloInstruction* instr2) { if (instr1->channel_id() != instr2->channel_id()) { - return FailedPrecondition( + return InternalError( "Expected to have the same channel id, actual channel ids are: %s " "(%lld), %s (%lld)", instr1->ToString().c_str(), instr1->channel_id(), instr2->ToString().c_str(), instr2->channel_id()); } - return tensorflow::Status::OK(); + return Status::OK(); } string ComputationsToString( @@ -571,22 +564,22 @@ string ComputationsToString( Status VerifyHloStructure(HloModule* module) { for (const HloComputation* computation : module->computations()) { if (computation->parent() == nullptr) { - return FailedPrecondition("Computation %s has a null parent pointer", - computation->name().c_str()); + return InternalError("Computation %s has a null parent pointer", + computation->name().c_str()); } if (computation->parent() != module) { - return FailedPrecondition( + return InternalError( "Computation %s parent() does not point to parent module", computation->name().c_str()); } for (const HloInstruction* instruction : computation->instructions()) { if (instruction->parent() == nullptr) { - return FailedPrecondition("Instruction %s has a null parent pointer", - instruction->name().c_str()); + return InternalError("Instruction %s has a null parent pointer", + instruction->name().c_str()); } if (instruction->parent() != computation) { - return FailedPrecondition( + return InternalError( "Instruction %s parent() does not point to parent computation", instruction->name().c_str()); } @@ -602,7 +595,7 @@ Status VerifyHloStructure(HloModule* module) { for (int i = 0; i < instruction->operand_count(); ++i) { const HloInstruction* operand = instruction->operand(i); if (operand->parent() != instruction->parent()) { - return FailedPrecondition( + return InternalError( "Operand %d (%s) of instruction 
%s is in a different " "computation: %s vs %s", i, operand->name().c_str(), instruction->name().c_str(), @@ -612,14 +605,14 @@ Status VerifyHloStructure(HloModule* module) { } } } - return tensorflow::Status::OK(); + return Status::OK(); } Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { // The parent fusion instruction of the fusion computation must be 'fusion'. HloComputation* fused_computation = fusion->fused_instructions_computation(); if (fusion != fused_computation->FusionInstruction()) { - return FailedPrecondition( + return InternalError( "Instruction of fused computation does not match expected instruction " "%s.", fusion->ToString().c_str()); @@ -635,37 +628,37 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { for (auto* instruction : fused_computation->instructions()) { if (fused_root == instruction) { if (root_owned) { - return FailedPrecondition("Root appears more than once in %s.", - fusion->ToString().c_str()); + return InternalError("Root appears more than once in %s.", + fusion->ToString().c_str()); } root_owned = true; } for (int i = 0; i < fused_parameters.size(); ++i) { if (fused_parameters[i] == instruction) { if (parameter_owned[i]) { - return FailedPrecondition("Parameter appears more than once in %s.", - fusion->ToString().c_str()); + return InternalError("Parameter appears more than once in %s.", + fusion->ToString().c_str()); } parameter_owned[i] = true; } } } if (!root_owned) { - return FailedPrecondition("Root not found in computation of %s.", - fusion->ToString().c_str()); + return InternalError("Root not found in computation of %s.", + fusion->ToString().c_str()); } // Make sure all the parameter_owned entries are set for (int i = 0; i < parameter_owned.size(); i++) { if (!parameter_owned[i]) { - return FailedPrecondition("Parameter %d not found in computation of %s.", - i, fusion->ToString().c_str()); + return InternalError("Parameter %d not found in computation of %s.", i, + fusion->ToString().c_str()); } } // Fused root must have no users. 
if (fused_root->user_count() != 0) { - return FailedPrecondition("Root of %s may not have users.", - fusion->ToString().c_str()); + return InternalError("Root of %s may not have users.", + fusion->ToString().c_str()); } // All uses of fused instructions must be in the fusion computation, and every @@ -674,13 +667,13 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { fusion->fused_instructions_computation()->instructions()) { if (instruction != fused_root) { if (instruction->user_count() == 0) { - return FailedPrecondition( - "Non-root instruction %s in %s must have users.", - instruction->ToString().c_str(), fusion->ToString().c_str()); + return InternalError("Non-root instruction %s in %s must have users.", + instruction->ToString().c_str(), + fusion->ToString().c_str()); } for (auto& user : instruction->users()) { if (fused_computation != user->parent()) { - return FailedPrecondition( + return InternalError( "Non-root instruction %s in %s may not have external users.", instruction->ToString().c_str(), fusion->ToString().c_str()); } @@ -695,41 +688,40 @@ Status HloVerifier::CheckFusionInstruction(HloInstruction* fusion) const { for (auto fused_param : fused_parameters) { int64 param_no = fused_param->parameter_number(); if (param_no < 0) { - return FailedPrecondition( - "Unexpected negative parameter number %lld in %s.", param_no, - fusion->ToString().c_str()); + return InternalError("Unexpected negative parameter number %lld in %s.", + param_no, fusion->ToString().c_str()); } if (param_no >= fused_parameters.size()) { - return FailedPrecondition( + return InternalError( "Unexpected parameter number %lld in %s: higher then number of " "parameters %lu.", param_no, fusion->ToString().c_str(), fused_parameters.size()); } if (parameter_numbers[param_no]) { - return FailedPrecondition( + return InternalError( "Did not expect parameter number %lld more than once in %s.", param_no, fusion->ToString().c_str()); } parameter_numbers[param_no] = true; if (!ShapeUtil::Compatible(fused_param->shape(), fusion->operand(param_no)->shape())) { - return FailedPrecondition( + return InternalError( "Shape mismatch between parameter number %lld and its operand in %s.", param_no, fusion->ToString().c_str()); } } - // Make sure all the parameter_numbers entries were seen + // Make sure all the parameter_numbers entries were seen. for (int i = 0; i < parameter_numbers.size(); i++) { if (!parameter_numbers[i]) { - return FailedPrecondition("Did not see parameter number %d in %s.", i, - fusion->ToString().c_str()); + return InternalError("Did not see parameter number %d in %s.", i, + fusion->ToString().c_str()); } } // TODO(b/65423525): We'd like to check that all operands are distinct. // This is currently disabled due to the invariant being violated by // multi-output fusion. 
- return tensorflow::Status::OK(); + return Status::OK(); } Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { @@ -778,7 +770,7 @@ Status HloVerifier::CheckWhileInstruction(HloInstruction* instruction) { "init: %s, body: %s", init->ToString().c_str(), body_root->ToString().c_str()); } - return tensorflow::Status::OK(); + return Status::OK(); } Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) { @@ -796,7 +788,7 @@ Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) { ShapeUtil::HumanString(operand_shape).c_str()); } } - return tensorflow::Status::OK(); + return Status::OK(); } StatusOr HloVerifier::Run(HloModule* module) { diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h index 6208887547a14d..1392a78097aa02 100644 --- a/tensorflow/compiler/xla/service/hlo_verifier.h +++ b/tensorflow/compiler/xla/service/hlo_verifier.h @@ -82,9 +82,7 @@ class ShapeVerifier : public DfsHloVisitor { Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override; Status HandleGather(HloInstruction* gather) override; - Status FinishVisit(HloInstruction*) override { - return tensorflow::Status::OK(); - } + Status FinishVisit(HloInstruction*) override { return Status::OK(); } protected: // Check the instruction's shape against the shape given by ShapeInference diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc index 13e4557317f74b..d7458c338e9f1d 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc @@ -27,6 +27,7 @@ using tensorflow::strings::HumanReadableElapsedTime; using tensorflow::strings::HumanReadableNumBytes; using tensorflow::strings::Printf; using tensorflow::strings::StrAppend; +using tensorflow::strings::StrCat; string HumanReadableProfileBuilder::ToString() const { string s; @@ -35,20 +36,26 @@ string HumanReadableProfileBuilder::ToString() const { computation_name_.c_str(), HumanReadableElapsedTime(CyclesToSeconds(total_cycles_)).c_str()); - auto append_op = [&](const OpInfo& op) { + auto print_op = [&](const OpInfo& op) { + // Skip ops with 0 optimal seconds and 0 actual cycles. These are ops that + // were expected to be free and are actually free -- things like (on most + // backends) kParameter or kConstant HLOs. There's no need to clutter the + // profile with these. 
+ if (op.optimal_seconds == 0 && op.cycles == 0) { + return; + } + string bytes_per_sec; string bytes_per_cycle; - if (op.cycles <= 0 || op.bytes_accessed < 0) { - bytes_per_sec = ""; - bytes_per_cycle = ""; - } else { - bytes_per_sec = - HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)); + if (op.cycles > 0 && op.bytes_accessed >= 0) { + bytes_per_sec = StrCat( + HumanReadableNumBytes(op.bytes_accessed / CyclesToSeconds(op.cycles)), + "/s"); + double bpc = static_cast(op.bytes_accessed) / op.cycles; if (op.bytes_accessed > op.cycles) { - bytes_per_cycle = HumanReadableNumBytes(op.bytes_accessed / op.cycles); + bytes_per_cycle = StrCat(HumanReadableNumBytes(bpc), "/cycle"); } else { - bytes_per_cycle = - Printf("%.3fB", static_cast(op.bytes_accessed) / op.cycles); + bytes_per_cycle = Printf("%.3fB/cycle", bpc); } } @@ -59,14 +66,16 @@ string HumanReadableProfileBuilder::ToString() const { double nsecs = op.cycles / clock_rate_ghz_; Appendf(&s, - "%15lld cycles (%6.2f%%) :: %12.1f usec (%12.1f optimal) :: %18s " - ":: %18s :: %12s/s :: %12s/cycle :: %s\n", + "%15lld cycles (%6.2f%%) :: %12.1f usec %22s :: %18s " + ":: %18s :: %14s :: %16s :: %s\n", op.cycles, cycles_percent, CyclesToMicroseconds(op.cycles), - op.optimal_seconds * 1e6, + op.optimal_seconds < 0 + ? "" + : Printf("(%12.1f optimal)", op.optimal_seconds * 1e6).c_str(), op.flop_count <= 0 - ? "" + ? "" : HumanReadableNumFlops(op.flop_count, nsecs).c_str(), - op.transcendental_count <= 0 ? "" + op.transcendental_count <= 0 ? "" : HumanReadableNumTranscendentalOps( op.transcendental_count, nsecs) .c_str(), @@ -78,24 +87,26 @@ string HumanReadableProfileBuilder::ToString() const { int64 total_transcendentals = 0.; int64 total_bytes = 0; for (const auto& op : op_infos_) { - optimal_seconds_sum += op.optimal_seconds; - total_flops += op.flop_count; - total_transcendentals += op.transcendental_count; - total_bytes += op.bytes_accessed; + if (op.optimal_seconds > 0) { + optimal_seconds_sum += op.optimal_seconds; + } + total_flops += std::max(op.flop_count, int64{0}); + total_transcendentals += std::max(op.transcendental_count, int64{0}); + total_bytes += std::max(op.bytes_accessed, int64{0}); } VLOG(1) << "Total floating point ops: " << total_flops; - append_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops, - total_transcendentals, total_bytes, optimal_seconds_sum}); + print_op({"[total]", "[total]", /*category=*/"", total_cycles_, total_flops, + total_transcendentals, total_bytes, optimal_seconds_sum}); - // Sort ops in decreasing order of cycles. + // Sort ops in decreasing order of cycles, and print them. std::vector sorted_ops(op_infos_); std::sort( sorted_ops.begin(), sorted_ops.end(), [](const OpInfo& a, const OpInfo& b) { return a.cycles > b.cycles; }); for (const auto& op : sorted_ops) { - append_op(op); + print_op(op); } if (total_cycles_ <= 0) { @@ -109,8 +120,20 @@ string HumanReadableProfileBuilder::ToString() const { table.SetMetricName("microseconds above estimated optimum"); table.SetEntryName("ops"); table.SetShowCategoryTable(); + table.SetShowAllEntries(); float total_discrepancy_in_microseconds = 0.0f; - for (const auto& op : sorted_ops) { + for (const auto& op : op_infos_) { + // Skip ops with < 0 optimal seconds. These are ops for which we don't + // know the optimal time. + if (op.optimal_seconds < 0) { + continue; + } + // Also skip ops with 0 actual cycles. 
These ops were free; there's no + // need to clutter the "above estimated optimum" table with them, + // because they can't be optimized further. + if (op.cycles == 0) { + continue; + } MetricTableReport::Entry entry; entry.text = op.name; entry.short_text = op.short_name; @@ -128,7 +151,14 @@ string HumanReadableProfileBuilder::ToString() const { table.SetMetricName("microseconds"); table.SetEntryName("ops"); table.SetShowCategoryTable(); - for (const auto& op : sorted_ops) { + table.SetShowAllEntries(); + for (const auto& op : op_infos_) { + // Skip ops with 0 optimal seconds and 0 actual cycles. As in + // print_op(), these are uninteresting because they're expected to be + // free, and they were actually free. + if (op.cycles == 0 && op.optimal_seconds == 0) { + continue; + } MetricTableReport::Entry entry; entry.text = op.name; entry.short_text = op.short_name; @@ -139,6 +169,23 @@ string HumanReadableProfileBuilder::ToString() const { StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_))); } } + + if (total_bytes > 0) { + MetricTableReport table; + table.SetMetricName("MiB read+written"); + table.SetEntryName("ops"); + table.SetShowCategoryTable(); + for (const auto& op : op_infos_) { + MetricTableReport::Entry entry; + entry.text = op.name; + entry.short_text = op.short_name; + entry.category_text = op.category; + entry.metric = static_cast(op.bytes_accessed) / (1 << 20); + table.AddEntry(std::move(entry)); + } + StrAppend(&s, + table.MakeReport(static_cast(total_bytes) / (1 << 20))); + } return s; } diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.h b/tensorflow/compiler/xla/service/human_readable_profile_builder.h index fc24acd2713f4c..6f56c3aa82e9d1 100644 --- a/tensorflow/compiler/xla/service/human_readable_profile_builder.h +++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.h @@ -32,7 +32,7 @@ class HumanReadableProfileBuilder { explicit HumanReadableProfileBuilder(tensorflow::StringPiece computation_name, int64 total_cycles, double clock_rate_ghz) - : computation_name_(computation_name.ToString()), + : computation_name_(std::string(computation_name)), total_cycles_(total_cycles), clock_rate_ghz_(clock_rate_ghz) { CHECK_GE(clock_rate_ghz, 1e-9); @@ -41,15 +41,17 @@ class HumanReadableProfileBuilder { int64 total_cycles() const { return total_cycles_; } // Adds an operation to the profile. If you don't know the number of - // floating-point ops or bytes touched by the op, pass -1 for that param. + // floating-point ops or bytes touched by the op, or if you don't know how + // fast it would run optimally, pass -1 for that param. void AddOp(tensorflow::StringPiece op_name, tensorflow::StringPiece short_name, tensorflow::StringPiece category, int64 cycles, int64 flop_count, int64 transcendental_count, int64 bytes_accessed, float optimal_seconds) { - op_infos_.push_back( - {op_name.ToString(), short_name.ToString(), category.ToString(), cycles, - flop_count, transcendental_count, bytes_accessed, optimal_seconds}); + op_infos_.push_back({std::string(op_name), std::string(short_name), + std::string(category), cycles, flop_count, + transcendental_count, bytes_accessed, + optimal_seconds}); } // Gets the human-readable profile. 
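The -1 sentinels described above are part of the `AddOp` contract, so callers pass them explicitly when a quantity is unknown. A minimal usage sketch follows; the computation name, op string, and cycle counts are made up for illustration, and only the constructor, `AddOp`, and `ToString` signatures come from this header:

```c++
// Illustrative only: report one op whose bytes accessed and optimal runtime
// are unknown (-1 by the convention documented above). Assumes
// human_readable_profile_builder.h is included and we are in namespace xla.
HumanReadableProfileBuilder builder(/*computation_name=*/"cluster_0",
                                    /*total_cycles=*/1000000,
                                    /*clock_rate_ghz=*/1.5);
builder.AddOp(/*op_name=*/"%fusion.3 = f32[128,256] fusion(...)",
              /*short_name=*/"fusion.3", /*category=*/"fusion",
              /*cycles=*/250000, /*flop_count=*/-1,
              /*transcendental_count=*/-1, /*bytes_accessed=*/-1,
              /*optimal_seconds=*/-1);
string report = builder.ToString();
```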
@@ -61,10 +63,10 @@ class HumanReadableProfileBuilder { string short_name; string category; int64 cycles; - int64 flop_count; + int64 flop_count; // -1 if unknown int64 transcendental_count; - int64 bytes_accessed; - float optimal_seconds; + int64 bytes_accessed; // -1 if unknown + float optimal_seconds; // -1 if unknown }; double CyclesToSeconds(int64 cycles) const { diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.cc b/tensorflow/compiler/xla/service/indexed_array_analysis.cc new file mode 100644 index 00000000000000..8b3fa6c1572cf0 --- /dev/null +++ b/tensorflow/compiler/xla/service/indexed_array_analysis.cc @@ -0,0 +1,733 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/indexed_array_analysis.h" +#include "tensorflow/compiler/xla/map_util.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/gtl/flatset.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" +#include "tensorflow/core/lib/strings/strcat.h" + +namespace xla { +namespace gtl = ::tensorflow::gtl; + +namespace { +using Analysis = IndexedArrayAnalysis; +using UnknownArray = Analysis::UnknownArray; +using ConstantArray = Analysis::ConstantArray; +using ScalarIndexedArray = Analysis::ScalarIndexedArray; +using tensorflow::gtl::ArraySlice; +using tensorflow::str_util::Join; +} // namespace + +string IndexedArrayAnalysis::ToString(Array* root, bool print_constants) { + switch (root->kind()) { + case Array::kUnknown: { + auto* unknown_tensor = root->as(); + return tensorflow::strings::StrCat("%", + unknown_tensor->instruction().name()); + } + + case Array::kConstant: { + if (print_constants) { + string contents = root->as()->literal()->ToString(); + return tensorflow::strings::StrCat( + "(constant ", ShapeUtil::HumanString(root->shape()), " ", contents, + ")"); + } + return tensorflow::strings::StrCat( + "(constant ", ShapeUtil::HumanString(root->shape()), ")"); + } + + case Array::kScalarIndexedConstant: + case Array::kScalarIndexed: { + auto* indexed_array = root->as(); + string name = root->kind() == Array::kScalarIndexedConstant + ? 
"scalar-indexed-const" + : "scalar-indexed"; + return tensorflow::strings::StrCat( + "(", name, " ", ToString(indexed_array->source(), print_constants), + " ", ToString(indexed_array->indices(), print_constants), " ", + indexed_array->source_dim(), "->[", + Join(indexed_array->output_dims(), ","), "])"); + } + } +} + +StatusOr IndexedArrayAnalysis::GetArrayFor( + const HloInstruction* instr) { + auto it = cache_.find(instr); + if (it != cache_.end()) { + return it->second; + } + + TF_RETURN_IF_ERROR(TraverseAndPopulateCache(instr)); + return FindOrDie(cache_, instr); +} + +Status IndexedArrayAnalysis::TraverseAndPopulateCache( + const HloInstruction* root) { + // Depth first search over the DAG, invoking ComputeArrayFor in post order. + // The HLO instructions already in the cache are considered leaves. + + gtl::InlinedVector stack; + + enum DfsState { kDiscovered, kVisited }; + gtl::FlatMap dfs_state_map; + + stack.push_back(root); + InsertOrDie(&dfs_state_map, root, kDiscovered); + + do { + const HloInstruction* instr = stack.back(); + if (cache_.count(instr)) { + stack.pop_back(); + continue; + } + + switch (FindOrDie(dfs_state_map, instr)) { + case kDiscovered: { + for (const HloInstruction* operand : instr->operands()) { + if (!cache_.count(operand)) { + stack.push_back(operand); + CHECK(!dfs_state_map.count(operand) || + dfs_state_map[operand] == kDiscovered); + dfs_state_map[operand] = kDiscovered; + } + } + dfs_state_map[instr] = kVisited; + break; + } + + case kVisited: + stack.pop_back(); + TF_ASSIGN_OR_RETURN(Array * array, ComputeArrayFor(instr)); + InsertOrDie(&cache_, instr, array); + break; + } + } while (!stack.empty()); + + return Status::OK(); +} + +StatusOr IndexedArrayAnalysis::ComputeArrayFor( + const HloInstruction* instr) { + Array* computed_array; + if (instr->IsElementwise() && instr->operand_count() == 1) { + TF_ASSIGN_OR_RETURN( + computed_array, + ComputeArrayForElementwiseUnaryOp( + instr->opcode(), FindOrDie(cache_, instr->operand(0)))); + } else if (instr->IsElementwise() && instr->operand_count() == 2) { + TF_ASSIGN_OR_RETURN( + computed_array, + ComputeArrayForElementwiseBinaryOp( + instr->opcode(), FindOrDie(cache_, instr->operand(0)), + FindOrDie(cache_, instr->operand(1)))); + } else if (instr->opcode() == HloOpcode::kConstant) { + TF_ASSIGN_OR_RETURN(computed_array, + ComputeArrayForConstant(instr->literal())); + } else if (instr->opcode() == HloOpcode::kGather) { + TF_ASSIGN_OR_RETURN( + computed_array, + ComputeArrayForGather(instr->shape(), instr->gather_dimension_numbers(), + instr->gather_window_bounds(), + FindOrDie(cache_, instr->operand(0)), + FindOrDie(cache_, instr->operand(1)))); + } else if (instr->opcode() == HloOpcode::kReshape) { + TF_ASSIGN_OR_RETURN( + computed_array, + ComputeArrayForReshape(instr->shape(), + FindOrDie(cache_, instr->operand(0)))); + } else { + computed_array = nullptr; + } + + if (!computed_array) { + computed_array = Construct(instr); + } + + return computed_array; +} + +StatusOr IndexedArrayAnalysis::ComputeArrayForConstant( + const Literal& literal) { + return Construct(&literal); +} + +StatusOr IndexedArrayAnalysis::FoldGatherOfGather( + ScalarIndexedArray* source, Array* indices, int64 source_dim, + tensorflow::gtl::ArraySlice output_dims, Shape shape) { + // We want to transform Gather(Gather(A, X), Y) => Gather(A, Gather(X, Y)). + // `source` is the inner Gather(A, X). 
+ + Array* a = source->source(); + Array* x = source->indices(); + Array* y = indices; + + // This bit is slightly tricky, so we do a naive "simulation" of the two + // consecutive gather operations to infer what the composed gather should look + // like. + + enum class IndexComponent { Ungathered, GatheredFirst, GatheredSecond }; + + std::vector simulated_index(a->shape().dimensions_size(), + IndexComponent::Ungathered); + + // Simulate the first gather. + EraseAt(&simulated_index, source->source_dim()); + for (int64 gather_dim : source->output_dims()) { + simulated_index.insert(simulated_index.begin() + gather_dim, + IndexComponent::GatheredFirst); + } + + // Simulate the second gather. + EraseAt(&simulated_index, source_dim); + for (int64 output_dim : output_dims) { + simulated_index.insert(simulated_index.begin() + output_dim, + IndexComponent::GatheredSecond); + } + + int64 source_dim_for_index_array = + FindIndex(source->output_dims(), source_dim); + CHECK_NE(source_dim_for_index_array, source->output_dims().size()); + + std::vector output_dims_for_index_array; + int64 gathered_index_components_seen = 0; + for (IndexComponent simulation_dim : simulated_index) { + if (simulation_dim == IndexComponent::GatheredSecond) { + output_dims_for_index_array.push_back(gathered_index_components_seen); + } + if (simulation_dim != IndexComponent::Ungathered) { + gathered_index_components_seen++; + } + } + + std::vector dim_sizes_for_composed_index; + std::vector output_dims_for_new_gather; + for (int64 i = 0, e = simulated_index.size(); i < e; i++) { + if (simulated_index[i] != IndexComponent::Ungathered) { + dim_sizes_for_composed_index.push_back(shape.dimensions(i)); + output_dims_for_new_gather.push_back(i); + } + } + + Array* inner_indices = ConstructScalarIndexedArray( + x, y, source_dim_for_index_array, output_dims_for_index_array, + ShapeUtil::MakeShape(x->shape().element_type(), + dim_sizes_for_composed_index)); + return ConstructScalarIndexedArray(a, inner_indices, source->source_dim(), + output_dims_for_new_gather, + std::move(shape)); +} + +StatusOr IndexedArrayAnalysis::ComputeArrayForGather( + const Shape& shape, const GatherDimensionNumbers& dim_numbers, + tensorflow::gtl::ArraySlice window_bounds, Array* source, + Array* indices) { + if (dim_numbers.index_vector_dim() != indices->shape().dimensions_size()) { + return nullptr; + } + + CHECK_EQ(dim_numbers.gather_dims_to_operand_dims_size(), 1); + if (!c_binary_search(dim_numbers.elided_window_dims(), + dim_numbers.gather_dims_to_operand_dims(0))) { + return nullptr; + } + + int64 source_dim = dim_numbers.gather_dims_to_operand_dims(0); + std::vector output_dims; + for (int64 i = 0, e = shape.dimensions_size(); i < e; i++) { + if (!c_binary_search(dim_numbers.output_window_dims(), i)) { + output_dims.push_back(i); + } + } + + if (auto* indexed = dynamic_cast(source)) { + auto it = c_find(indexed->output_dims(), source_dim); + if (it != indexed->output_dims().end()) { + return FoldGatherOfGather(indexed, indices, source_dim, output_dims, + shape); + } + } else if (auto* constant = dynamic_cast(source)) { + return Construct(constant, indices, source_dim, + output_dims, shape); + } + + return Construct(source, indices, source_dim, output_dims, + shape); +} + +namespace { +// Returns an index into `values` such that the product of the range +// [values.begin()+index, values.end()) is equal to `product`. If there is no +// such index, return -1. All integers in `values` must be positive. 
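+// For example (values chosen purely for illustration), with values = {6, 4, 5}:
+// product = 20 returns 1, product = 120 returns 0, and product = 7 returns -1.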
+int64 FindSuffixWithProduct(ArraySlice values, int64 product) { + DCHECK(c_all_of(values, [](int64 value) { return value > 0; })); + + int64 current_product = 1; + int64 i; + for (i = values.size() - 1; i >= 0 && product > current_product; --i) { + current_product *= values[i]; + } + + if (product == current_product) { + return i + 1; + } + + return -1; +} + +struct ReshapePassthroughDimPair { + int64 result_dim; + int64 operand_dim; +}; + +// Returns a set of dimension pairs such for all (result_dim, operand_dim) in +// the set: +// +// output_index[result_dim] = SourceIndexOfReshape(output_index)[operand_dim] +// +// The returned vector of pairs is sorted in both the result_dim and the +// operand_dim components. +std::vector ComputeReshapePassthroughDimPairs( + ArraySlice operand_shape, ArraySlice result_shape) { + // A reshape can be seen as an index mapping from output index to input index: + // + // (i_0, ..., i_n) = f(o_0, ..., o_m) + // + // This function returns the pairs (j, k) for which the following invariant + // holds for all indices in the shape: + // + // o_j == i_k + // + // And this occurs when: + // + // O_{j+1} * ... * O_n == I_{k+1} * ... * I_m + // + // (where O_x are the sizes of the output shape and I_x are the sizes of the + // input shape) and the size of the dimension j of the result is the same as + // the size of dimension k in the operand. + // + // These conditions are sufficient because the Reshape HLO is spec'ed such + // that the rightmost dimensions are always minor in the flattening and refine + // operation. + + std::vector result; + int64 result_subarray_size = 1; + for (int64 result_dim = result_shape.size() - 1; result_dim >= 0; + --result_dim) { + int64 candidate_operand_dim = + FindSuffixWithProduct(operand_shape, result_subarray_size); + + // result_subarray_size does not include the elements in the current + // `result_dim` dimension (we multiply in result_shape[result_dim] at the + // end of loop body) so candidate_operand_dim can never be zero. + CHECK_NE(candidate_operand_dim, 0); + + if (candidate_operand_dim != -1 && + result_shape[result_dim] == operand_shape[candidate_operand_dim - 1]) { + result.push_back({/*result_dim=*/result_dim, + /*operand_dim=*/candidate_operand_dim - 1}); + } + result_subarray_size *= result_shape[result_dim]; + } + + c_reverse(result); + + if (VLOG_IS_ON(3)) { + std::vector result_strings; + c_transform(result, std::back_inserter(result_strings), + [](ReshapePassthroughDimPair value) { + return tensorflow::strings::StrCat(value.result_dim, "->", + value.operand_dim); + }); + VLOG(3) << "For a reshape from [" << Join(operand_shape, ",") << "] to [" + << Join(result_shape, ",") << "] passthrough indices are [" + << Join(result_strings, ",") << "]"; + } + + DCHECK(c_is_sorted( + result, [](ReshapePassthroughDimPair lhs, ReshapePassthroughDimPair rhs) { + return lhs.result_dim < rhs.result_dim; + })); + + DCHECK(c_is_sorted( + result, [](ReshapePassthroughDimPair lhs, ReshapePassthroughDimPair rhs) { + return lhs.operand_dim < rhs.operand_dim; + })); + + return result; +} + +// Return true if `dim` is stated as an passthrough operand dim in +// `passthrough_dims`. 
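+// For example (an illustrative value, not derived from a real reshape), with
+// passthrough_dims = {{/*result_dim=*/0, /*operand_dim=*/0},
+//                     {/*result_dim=*/2, /*operand_dim=*/1}}
+// this returns true for dims 0 and 1, and false for every other dim.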
+bool IsReshapePassthroughOperandDim( + ArraySlice passthrough_dims, int64 dim) { + return c_any_of(passthrough_dims, + [&](ReshapePassthroughDimPair passthrough_dim_pair) { + return passthrough_dim_pair.operand_dim == dim; + }); +} + +// Maps `operand_dim` which must be an passthrough operand dimension to its +// corresponding passthrough result dimension based on `passthrough_dims`. +int64 MapPassthroughOperandDimToResultDim( + ArraySlice passthrough_dims, int64 operand_dim) { + auto it = c_find_if(passthrough_dims, + [&](ReshapePassthroughDimPair passthrough_dim_pair) { + return passthrough_dim_pair.operand_dim == operand_dim; + }); + CHECK(it != passthrough_dims.end()); + return it->result_dim; +} + +int64 FindSourcePositionForPassthroughResultDim(ArraySlice operand_shape, + ArraySlice result_shape, + int64 source_passthrough_dim) { + int64 indexed_source_subarray_size = + std::accumulate(operand_shape.begin() + source_passthrough_dim + 1, + operand_shape.end(), 1, std::multiplies()); + + return FindSuffixWithProduct(result_shape, indexed_source_subarray_size); +} + +}; // namespace + +StatusOr IndexedArrayAnalysis::ComputeArrayForReshape( + const Shape& shape, Array* operand) { + auto* scalar_indexed = dynamic_cast(operand); + if (!scalar_indexed) { + return nullptr; + } + + // Try to fold Reshape(ScalarIndexed(Const, Indices)) + // => ScalarIndexed(Const', Indices) + // + // We can view the reshape and the scalar-indexed operations as functions that + // map an output index (i.e. an index into the result) to an input index + // (i.e. an index into the operand). The key idea used here is that the + // output-to-input mapping for some reshape operations may "pass through" some + // output dimensions into the input space unchanged -- i.e. there may exist + // output dimension "O" and input dimension "I" such that OutputIndex[O] is + // always == InputIndexForReshape(OutputIndex)[I]. If these pass-through + // dimensions in the input space of the reshape happen to be include all the + // output dimensions for the scalar-indexed node then, roughly, the following + // holds: + // + // SourceIndexOfScalarIndexed(SourceIndexOfReshape(Idx)) + // == SourceIndexOfScalarIndexed(SourceIndexOfReshape(Ps ++ Qs)) + // + // Where Ps are the set of the pass-through components of Idx that are + // also the output dims of the scalar-indexed node, and Qs are the rest. + // For brevity, we're playing fast and loose with the notation here -- we + // don't literally require Idx to be a concatenation of Ps and Qs, as + // suggested by the "++". + // + // == SourceIndexOfScalarIndexed(Ps ++ SourceIndexOfReshape(Qs)) + // + // Again, we're playing fast and loose with the notation around "++". + // Generally this ++ will be a different function that the ++ in the + // previous step. + // + // If the scalar-indexed node has a constant as the source then the + // SourceIndexOfReshape function can be "folded into" the constant itself by + // reshaping it, leaving us with: + // + // == SourceIndexOfScalarIndexed(Ps ++ Qs) + // == SourceIndexOfScalarIndexed(Idx) + // + // which is just a scalar-indexed node (with parameters different from the + // scalar-indexed node we started with) with a reshaped constant as the + // source. 
+ // + // We can't fold SourceIndexOfReshape into the constant without introducing + // another precondition: since the new scalar-indexed node will have a + // reshaped (constant) array as its source it will, in general, have a + // different source dimension than the original scalar-indexed node. This + // source dimension will have to be a passthrough dimension of the + // SourceIndexOfReshape indexing function that is folded into the source. And + // such a dimension need not exist so this is a non-trivial precondition. + + std::vector reshape_passthrough_dims = + ComputeReshapePassthroughDimPairs( + /*operand_shape=*/AsInt64Slice(operand->shape().dimensions()), + /*result_shape=*/AsInt64Slice(shape.dimensions())); + + auto is_reshape_passthrough_operand_dim = [&](int64 operand_dim) { + return IsReshapePassthroughOperandDim(reshape_passthrough_dims, + operand_dim); + }; + + if (!c_all_of(scalar_indexed->output_dims(), + is_reshape_passthrough_operand_dim)) { + return nullptr; + } + + // To compute the shape of the source for the new scalar-indexed node we're + // going to create, we first "undo" the scalar-indexed operation. + std::vector new_scalar_indexed_source_shape(shape.dimensions().begin(), + shape.dimensions().end()); + for (int64 i = scalar_indexed->output_dims().size() - 1; i >= 0; i--) { + int64 output_dim = scalar_indexed->output_dims()[i]; + int64 output_dim_after_reshape = MapPassthroughOperandDimToResultDim( + reshape_passthrough_dims, output_dim); + EraseAt(&new_scalar_indexed_source_shape, output_dim_after_reshape); + } + + // After this, we need to add in the dimension that will be the source + // dimension for the new scalar-indexed node. A scalar-indexed node "removes" + // the source dimensions and "adds" the output dimensions, so to get back to + // the shape for the *source* of the scalar-indexed node we need to remove the + // output dims (which we did above) and then add back the source dim (which we + // are about to do below): + + const Shape& scalar_indexed_source_shape = scalar_indexed->source()->shape(); + + int64 source_dim_for_new_scalar_indexed_node = + FindSourcePositionForPassthroughResultDim( + /*operand_shape=*/AsInt64Slice( + scalar_indexed_source_shape.dimensions()), + /*result_shape=*/new_scalar_indexed_source_shape, + scalar_indexed->source_dim()); + + // We may not be able to find a source dim for the new scalar-indexed node. + // For instance consider: + // + // operand = s32[3,5,2] constant({...}) + // indices = s32[7] parameter(0) + // gather = s32[3,2,7] gather(operand, indices), + // output_window_dims={0,1}, + // elided_window_dims={1}, + // gather_dims_to_operand_dims={1}, + // index_vector_dim=1, + // window_bounds={3,1,2} + // reshape = s32[6,7] reshape(gather) + // + // In this case the gather maps to: + // (scalar-indexed-const (constant s32[3,5,2]) %indices 1->[2]) + // + // and the reshape passes through dimension 2 from its input into dimension 1 + // in its output. However, we can't rewrite the reshape as a scalar-indexed + // node because then we'd have to reshape the [3,5,2] `operand` array to + // [6,5], but then dimension 1 of the reshaped [6,5] array indexes differently + // (a.k.a. isn't pass-through) than the [3,5,2] array. 
+ + if (source_dim_for_new_scalar_indexed_node == -1) { + return nullptr; + } + + InsertAt( + &new_scalar_indexed_source_shape, source_dim_for_new_scalar_indexed_node, + scalar_indexed_source_shape.dimensions(scalar_indexed->source_dim())); + + CHECK(IsReshapePassthroughOperandDim( + ComputeReshapePassthroughDimPairs( + /*operand_shape=*/AsInt64Slice( + scalar_indexed_source_shape.dimensions()), + /*result_shape=*/new_scalar_indexed_source_shape), + scalar_indexed->source_dim())); + + auto map_passthrough_operand_dim_to_result_dim = [&](int64 result_dim) { + return MapPassthroughOperandDimToResultDim(reshape_passthrough_dims, + result_dim); + }; + + std::vector output_dims_for_new_scalar_indexed_node; + c_transform(scalar_indexed->output_dims(), + std::back_inserter(output_dims_for_new_scalar_indexed_node), + map_passthrough_operand_dim_to_result_dim); + + TF_ASSIGN_OR_RETURN(const Literal* new_scalar_indexed_source_literal, + TakeOwnership(scalar_indexed->literal().Reshape( + new_scalar_indexed_source_shape))); + TF_ASSIGN_OR_RETURN( + Array * new_scalar_indexed_source, + ComputeArrayForConstant(*new_scalar_indexed_source_literal)); + + return ConstructScalarIndexedArray( + new_scalar_indexed_source, scalar_indexed->indices(), + source_dim_for_new_scalar_indexed_node, + output_dims_for_new_scalar_indexed_node, shape); +} + +StatusOr +IndexedArrayAnalysis::ComputeArrayForElementwiseBinaryOp(HloOpcode opcode, + Array* lhs, + Array* rhs) { + // Try to fold BinaryOp(Broadcast(Const0), ScalarIndexed(Const1, Indices)) + // => ScalarIndexed(BinaryOp(Broadcast'(Const0), Const1), Indices) + // + // We can do this if every output dimension from the scalar-indexed node is a + // broadcasted dimension for the broadcast node. Informally, the precondition + // means Broadcast(Const0)[IDX] is solely a function of the components of IDX + // that are not output-dims for the scalar-indexed node. In other words, for + // every assignment to the non-output dims in IDX we have a "constant" LHS to + // the BinaryOp. This transform propagates this "constant" to the source for + // the scalar-indexed node. + + ScalarIndexedConstantArray* lhs_scalar_indexed_const = + dynamic_cast(lhs); + ScalarIndexedConstantArray* rhs_scalar_indexed_const = + dynamic_cast(rhs); + + bool lhs_is_indexed; + + // One of the operands must be scalar-indexed and the other must be a + // broadcast of a constant. + if (lhs_scalar_indexed_const && !rhs_scalar_indexed_const) { + lhs_is_indexed = true; + } else if (rhs_scalar_indexed_const && !lhs_scalar_indexed_const) { + lhs_is_indexed = false; + } else { + return nullptr; + } + + ScalarIndexedConstantArray* scalar_indexed_const = + lhs_is_indexed ? lhs_scalar_indexed_const : rhs_scalar_indexed_const; + UnknownArray* candidate_broadcast_array = + dynamic_cast(lhs_is_indexed ? rhs : lhs); + if (!candidate_broadcast_array || + candidate_broadcast_array->instruction().opcode() != + HloOpcode::kBroadcast) { + return nullptr; + } + + const HloInstruction* broadcast_instr = + &candidate_broadcast_array->instruction(); + const HloInstruction* broadcast_const_operand = broadcast_instr->operand(0); + if (broadcast_const_operand->opcode() != HloOpcode::kConstant) { + return nullptr; + } + + ArraySlice broadcast_dims = broadcast_instr->dimensions(); + auto is_broadcasted_dim = [&](int64 output_dim) { + return c_find(broadcast_dims, output_dim) == broadcast_dims.end(); + }; + + // All of the output dims must be "broadcasted" dims for the other operand. 
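+  // For instance (hypothetical shapes): a broadcast with dimensions={0,2}
+  // creates output dim 1 out of thin air, so a scalar-indexed operand with
+  // output_dims() == {1} passes this check, while output_dims() == {0} would
+  // not.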
+ if (!c_all_of(scalar_indexed_const->output_dims(), is_broadcasted_dim)) { + return nullptr; + } + + // To figure out the broadcast dimensions for the (constant) source for the + // scalar-indexed node, we "simulate" the index transformation done by the + // existing broadcsat: + enum class IndexComponent { Broadcasted, NotBroadcasted }; + std::vector simulated_index( + broadcast_instr->shape().dimensions_size(), IndexComponent::Broadcasted); + for (int64 broadcast_dim : broadcast_dims) { + simulated_index[broadcast_dim] = IndexComponent::NotBroadcasted; + } + + // The scalar-indexed node "removes" the source dim and "inserts" the output + // dims. We do the opposite here to undo the scalar-indexed operation. + ArraySlice output_dims = scalar_indexed_const->output_dims(); + for (int64 i = output_dims.size() - 1; i >= 0; --i) { + CHECK(simulated_index[output_dims[i]] == IndexComponent::Broadcasted); + EraseAt(&simulated_index, output_dims[i]); + } + + InsertAt(&simulated_index, scalar_indexed_const->source_dim(), + IndexComponent::Broadcasted); + + // new_inner_broadcast_dims holds the broadcast dimensions for the inner + // BinaryOp(Broadcast'(Const0), Const1). We now translate simulated_index to + // new_inner_broadcast_dims. + std::vector new_inner_broadcast_dims; + for (int64 i = 0; i < simulated_index.size(); i++) { + if (simulated_index[i] == IndexComponent::NotBroadcasted) { + new_inner_broadcast_dims.push_back(i); + } + } + + // inner_broadcast_result is the Broadcast'(Const0) bit in + // BinaryOp(Broadcast'(Const0), Const1) + TF_ASSIGN_OR_RETURN( + std::unique_ptr inner_broadcast_result, + broadcast_const_operand->literal().Broadcast( + scalar_indexed_const->source()->shape(), new_inner_broadcast_dims)); + + // literal_for_new_source is BinaryOp(Broadcast'(Const0), Const1) + const Literal* literal_for_new_source; + if (lhs_is_indexed) { + TF_ASSIGN_OR_RETURN( + literal_for_new_source, + TakeOwnership(HloEvaluator{}.EvaluateElementwiseBinaryOp( + opcode, scalar_indexed_const->literal(), *inner_broadcast_result))); + } else { + TF_ASSIGN_OR_RETURN( + literal_for_new_source, + TakeOwnership(HloEvaluator{}.EvaluateElementwiseBinaryOp( + opcode, *inner_broadcast_result, scalar_indexed_const->literal()))); + } + + ConstantArray* new_source = Construct(literal_for_new_source); + return Construct( + new_source, scalar_indexed_const->indices(), + scalar_indexed_const->source_dim(), + std::vector(scalar_indexed_const->output_dims().begin(), + scalar_indexed_const->output_dims().end()), + scalar_indexed_const->shape()); +} + +StatusOr +IndexedArrayAnalysis::ComputeArrayForElementwiseUnaryOp(HloOpcode opcode, + Array* operand) { + auto* scalar_indexed_const = + dynamic_cast(operand); + if (scalar_indexed_const == nullptr) { + return nullptr; + } + + // Fold UnaryOp(ScalarIndexed(Const, Indices)) + // => ScalarIndexed(UnaryOp(Const), Indices) + + TF_ASSIGN_OR_RETURN(Literal * literal_for_new_source, + TakeOwnership(HloEvaluator{}.EvaluateElementwiseUnaryOp( + opcode, scalar_indexed_const->literal()))); + ConstantArray* new_source = Construct(literal_for_new_source); + return Construct( + new_source, scalar_indexed_const->indices(), + scalar_indexed_const->source_dim(), + std::vector(scalar_indexed_const->output_dims().begin(), + scalar_indexed_const->output_dims().end()), + scalar_indexed_const->shape()); +} + +tensorflow::StringPiece IndexedArrayAnalysisPrinterPass::name() const { + return "indexed-array-analysis-printer-pass"; +} + +StatusOr IndexedArrayAnalysisPrinterPass::Run(HloModule* 
module) { + if (!VLOG_IS_ON(2)) { + return false; + } + + IndexedArrayAnalysis analysis; + for (auto* computation : module->MakeNonfusionComputations()) { + for (auto* instr : computation->instructions()) { + TF_ASSIGN_OR_RETURN(Analysis::Array * t, analysis.GetArrayFor(instr)); + if (!dynamic_cast(t) && !dynamic_cast(t)) { + VLOG(2) << instr->ToString() << " -> " << analysis.ToString(t); + } + } + } + + return false; +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis.h b/tensorflow/compiler/xla/service/indexed_array_analysis.h new file mode 100644 index 00000000000000..ce92fd2919c90f --- /dev/null +++ b/tensorflow/compiler/xla/service/indexed_array_analysis.h @@ -0,0 +1,326 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INDEXED_ARRAY_ANALYSIS_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_INDEXED_ARRAY_ANALYSIS_H_ + +#include + +#include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace xla { + +// IndexedArrayAnalysis decides if an HLO instruction can be rewritten as a +// gather from another array. It does this by mapping HLO instructions to +// instances of IndexedArrayAnalysis::Array, which can be inspected to discover +// whether said HLO is equivalent to a gather. +class IndexedArrayAnalysis { + public: + // IndexedArrayAnalysis maps each HLO instruction to an instance of a Array. + // Array really just a sum type of the classes that inherit from it. The + // meaning of each of the subtypes is documented on the subtype declaration. + // + // Array instances are immutable once created. + class Array { + public: + enum Kind { kUnknown, kConstant, kScalarIndexedConstant, kScalarIndexed }; + + virtual Kind kind() const = 0; + virtual const Shape& shape() const = 0; + + // Does a checked downcast from `Array` to `T` which must be one of its + // subtypes. + template + T* as() { + static_assert((std::is_base_of::value), + "target type not derived from source type"); + // We skip the CHECK and hence the dynamic_cast if RTTI is disabled. +#if !defined(__GNUC__) || defined(__GXX_RTTI) + CHECK_NE(dynamic_cast(this), nullptr); +#endif // !defined(__GNUC__) || defined(__GXX_RTTI) + + return static_cast(this); + } + + virtual ~Array() = default; + + Array& operator=(const Array& other) = delete; + }; + + // Represents an HLO instruction that was not analyzable by this + // IndexedArrayAnalysis. Instances of UnknownArray just wrap an existing + // HloInstruction. 
+ class UnknownArray : public Array { + public: + Kind kind() const override { return kUnknown; } + const Shape& shape() const override { return instruction().shape(); } + const HloInstruction& instruction() const { return instruction_; } + + private: + explicit UnknownArray(const HloInstruction* instr) : instruction_(*instr) {} + + const HloInstruction& instruction_; + + friend class IndexedArrayAnalysis; + }; + + // Represents a constant value. This constant value may be present in the HLO + // module being analyzed, or it could have been created on the fly by the + // analysis. + class ConstantArray : public Array { + public: + Kind kind() const override { return kConstant; } + const Shape& shape() const override { return literal()->shape(); } + const Literal* literal() const { return literal_; } + + private: + explicit ConstantArray(const Literal* literal) : literal_(literal) {} + const Literal* literal_; + + friend class IndexedArrayAnalysis; + }; + + // --------------------------------------------------------------------------- + // Indexed Array Overview + // --------------------------------------------------------------------------- + // + // ScalarIndexedArray and ScalarIndexedConstantArray form the core of this + // analysis. ScalarIndexedConstantArray is just a specialization of + // ScalarIndexedArray so we will only discuss ScalarIndexedArray in this + // overview. + // + // A ScalarIndexedArray represents an array that can be computed by indexing + // into a "source" array using an "indices" tensor. A simple example is a + // gather operation gathering 12 rows out of a [100,100] matrix -- such an + // operation will be represented by an instance of a ScalarIndexedArray with + // the [100,100] matrix as the "source" array and the [12]-shaped indices + // array as the "indices" tensor. The ScalarIndexedArray operation itself + // will be of shape [12,100] (assuming we were gathering with axis=0). + // + // Gather operations are not the only operation that maps to + // ScalarIndexedArray instances (if that were true there would be little point + // in having a separate analysis). We can often infer ScalarIndexedArrays for + // other operations too. For instance, consider: + // + // %source = f32[100,100] constant + // %indices = s32[12] ... + // %gather = f32[12,100] ... gather from %source using %indices at axis 0 + // %dot = dot(%gather, other_constant) [canonical contracting dims] + // + // The dot operation itself is also a ScalarIndexedArray with source = + // dot(constant, other_constant) and indices = %indices. A reshape of %gather + // to [12,5,20] too is a ScalarIndexedArray with source = an appropriately + // reshaped constant and indices = %indices. + + // Represents the result of a gather operation. This gather operation may + // explicitly be present in the HLO module being analyzed, or it could have + // been created on the fly by the analysis. 
+ // + // An instance of ScalarIndexedArray represents a array whose I'th element can + // be mapped to the J'th element of the `source` array (where I and J are + // multidimensional indices) in this way: + // + // I' = remove components at positions `output_dims` from I + // G' = remove components not at positions `output_dims` from I + // T = indices[G'] + // J = I' with T inserted at position `source_dim` + // + // For example, if source is of shape [11,13,17,19], indices is of shape + // [23,29], output_dims is [0,2] and source_dim is 2 then the output is of + // shape [23,11,29,13,19] and the output index [A,B,C,D,E] is mapped to the + // input index [B,D,indices[A,C],E]. + class ScalarIndexedArray : public Array { + public: + Kind kind() const override { return kScalarIndexed; } + const Shape& shape() const override { return shape_; } + + Array* source() const { return source_; } + Array* indices() const { return indices_; } + + // `source_dim` is the dimension in the source array that is being indexed + // over using indices from the `indices` array. See the class documentation + // and the overview for more details. + int64 source_dim() const { return source_dim_; } + + // `output_dims` are the dimensions in the output array that are being used + // to compute an index into the `indices` array. See the class + // documentation and the overview for more details. + tensorflow::gtl::ArraySlice output_dims() const { + return output_dims_; + } + + private: + explicit ScalarIndexedArray(Array* source, Array* indices, int64 source_dim, + std::vector output_dims, Shape shape) + : source_(source), + indices_(indices), + source_dim_(source_dim), + output_dims_(std::move(output_dims)), + shape_(std::move(shape)) {} + + Array* source_; + Array* indices_; + int64 source_dim_; + std::vector output_dims_; + Shape shape_; + + friend class IndexedArrayAnalysis; + }; + + // A ScalarIndexedConstantArray is just a ScalarIndexedArray constrained to + // have a ConstantArray instance as the source. This is an ergonomic + // concession -- in theory it is possible to just keep ScalarIndexedArray and + // check source()->kind(). + class ScalarIndexedConstantArray : public ScalarIndexedArray { + public: + Kind kind() const override { return kScalarIndexedConstant; } + + const Literal& literal() const { + return *source()->as()->literal(); + } + + private: + explicit ScalarIndexedConstantArray(Array* source, Array* indices, + int64 source_dim, + std::vector output_dims, + Shape shape) + : ScalarIndexedArray(source, indices, source_dim, + std::move(output_dims), std::move(shape)) { + CHECK(dynamic_cast(source)); + } + + friend class IndexedArrayAnalysis; + }; + + // Returns an Array instance for `instr`. The IndexedArrayAnalysis instance + // keeps ownership of the returned Array instance. + // + // Caching Behavior: IndexedArrayAnalysis has a cache mapping HLO + // instructions to IndexedArrayAnalysis::Array instances. This entire cache + // becomes stale and may cause the analysis to return incorrect results if any + // transitive operand (stopping at the containing computation) is modified for + // any HLO instruction on which GetArrayFor has been invoked. + // + // NB! By inspecting the implementation, you may be able to infer a stronger + // caching guarantee than what is mentioned above. Nevertheless, what is + // stated above is the contract. + StatusOr GetArrayFor(const HloInstruction* instr); + + // Pretty-prints the expression rooted at `root`. 
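// [Illustrative sketch, not part of this patch.] A standalone version of the
// output-index -> source-index mapping documented for ScalarIndexedArray
// above. Only the mapping rule comes from that comment: drop the
// `output_dims` components from the output index, look the dropped components
// up in `indices`, and insert the result at `source_dim`. The helper names
// and the stand-in `indices` lookup are made up.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

static std::vector<int64_t> MapOutputIndexToSourceIndex(
    const std::vector<int64_t>& output_index,
    const std::vector<int64_t>& output_dims, int64_t source_dim,
    const std::function<int64_t(const std::vector<int64_t>&)>& indices) {
  std::vector<int64_t> source_index;  // I': output index minus output_dims.
  std::vector<int64_t> gather_index;  // G': just the output_dims components.
  for (int64_t i = 0; i < static_cast<int64_t>(output_index.size()); ++i) {
    bool is_output_dim =
        std::find(output_dims.begin(), output_dims.end(), i) !=
        output_dims.end();
    (is_output_dim ? gather_index : source_index).push_back(output_index[i]);
  }
  int64_t t = indices(gather_index);                           // T = indices[G'].
  source_index.insert(source_index.begin() + source_dim, t);   // J.
  return source_index;
}

int main() {
  // Worked example from the comment above: output_dims = {0,2} and
  // source_dim = 2, so output index [A,B,C,D,E] maps to [B,D,indices[A,C],E].
  auto indices = [](const std::vector<int64_t>& g) -> int64_t {
    return 100 * g[0] + g[1];  // stand-in for the real indices[A,C] lookup
  };
  std::vector<int64_t> j =
      MapOutputIndexToSourceIndex({7, 1, 9, 2, 3}, {0, 2}, 2, indices);
  for (int64_t d : j) std::cout << d << " ";  // prints: 1 2 709 3
  std::cout << "\n";
  return 0;
}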
+ string ToString(Array* root, bool print_constants = false); + + private: + // Helper function that ensures that every HLO instruction that is + // transitively used by `root` has an entry in `cache_`. + Status TraverseAndPopulateCache(const HloInstruction* root); + + // Creates an Array instance for `instr` under the assumption that all + // operations of `instr` are present in `cache_`. + StatusOr ComputeArrayFor(const HloInstruction* instr); + + StatusOr ComputeArrayForConstant(const Literal& literal); + + StatusOr ComputeArrayForGather( + const Shape& shape, const GatherDimensionNumbers& dim_numbers, + tensorflow::gtl::ArraySlice window_bounds, Array* source, + Array* indices); + + // This tries to fold a ScalarIndexedArray which has another + // ScalarIndexedArray as a source into a ScalarIndexedArray that instead has a + // ScalarIndexedArray as indices. If `source` happened to be a + // ScalarIndexedConstantArray this can result in an expression that is more + // canonical. + // + // As an example, consider a gather operation, G0, gathering 7 elements from + // an array "Arr" of shape [100] resulting in an array of shape [7], and a + // second gather operation, G1, which gathers 3 elements out of the result of + // G0 resulting in an array of shape [3]. Let the indices uses by G0 be I0 + // (of shape [7]) and the indices used by G1 be I1 (of shape [3]). We can + // instead rewrite G1 to gather directly from "Arr" with the three indices + // from I0 as per I1. In other words, we can rewrite: + // + // G0 = [Arr[i] for i in I0] + // G1 = [G0[i] for i in I1] + // + // into + // + // I2 = [I0[i] for i in I1] + // G1 = [Arr[i] for i in I2] + StatusOr FoldGatherOfGather( + ScalarIndexedArray* source, Array* indices, int64 source_dim, + tensorflow::gtl::ArraySlice output_dims, Shape shape); + + StatusOr ComputeArrayForReshape(const Shape& shape, Array* operand); + + StatusOr ComputeArrayForElementwiseBinaryOp(HloOpcode opcode, + Array* lhs, Array* rhs); + StatusOr ComputeArrayForElementwiseUnaryOp(HloOpcode opcode, + Array* operand); + + template + T* Construct(Args&&... args) { + T* new_tensor = new T(std::forward(args)...); + owned_tensors_.push_back(std::unique_ptr(new_tensor)); + return new_tensor; + } + + ScalarIndexedArray* ConstructScalarIndexedArray( + Array* source, Array* indices, int64 source_dim, + std::vector output_dims, Shape shape) { + if (source->kind() == Array::kConstant) { + return Construct(source, indices, source_dim, + std::move(output_dims), + std::move(shape)); + } else { + return Construct(source, indices, source_dim, + std::move(output_dims), + std::move(shape)); + } + } + + Literal* TakeOwnership(std::unique_ptr literal) { + owned_literals_.push_back(std::move(literal)); + return owned_literals_.back().get(); + } + + StatusOr TakeOwnership( + StatusOr> literal_or_error) { + TF_ASSIGN_OR_RETURN(std::unique_ptr literal, + std::move(literal_or_error)); + owned_literals_.push_back(std::move(literal)); + return owned_literals_.back().get(); + } + + std::vector> owned_tensors_; + std::vector> owned_literals_; + tensorflow::gtl::FlatMap cache_; +}; + +// A pass that prints all non-trivial results returned by IndexedArrayAnalysis. +// This pass is a no-op if !VLOG_IS_ON(2) so it should be fine to +// unconditionally add to the regular HLO pass pipeline. 
+class IndexedArrayAnalysisPrinterPass : public HloPassInterface { + public: + tensorflow::StringPiece name() const override; + StatusOr Run(HloModule* module) override; +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_INDEXED_ARRAY_ANALYSIS_H_ diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc new file mode 100644 index 00000000000000..373556ebeba883 --- /dev/null +++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc @@ -0,0 +1,504 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/indexed_array_analysis.h" +#include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/tests/test_utils.h" + +namespace xla { +namespace { +class IndexedArrayAnalysisTest : public HloVerifiedTestBase { + protected: + void AssertArrayForRootExpressionIs(const string& hlo_text, + const string& root_expression) { + AssertArrayForRootExpressionIsImpl(hlo_text, root_expression, + /*print_constants=*/false); + } + + void AssertArrayWithConstantsForRootExpressionIs( + const string& hlo_text, const string& root_expression) { + AssertArrayForRootExpressionIsImpl(hlo_text, root_expression, + /*print_constants=*/true); + } + + private: + void AssertArrayForRootExpressionIsImpl(const string& hlo_text, + const string& root_expression, + bool print_constants) { + IndexedArrayAnalysis indexed_tensor_analysis; + ParseAndVerifyModule(hlo_text); + + TF_ASSERT_OK_AND_ASSIGN( + IndexedArrayAnalysis::Array* const array_result, + indexed_tensor_analysis.GetArrayFor( + module().entry_computation()->root_instruction())); + string string_result = + indexed_tensor_analysis.ToString(array_result, print_constants); + LOG(INFO) << string_result; + ASSERT_EQ(string_result, root_expression); + } +}; + +TEST_F(IndexedArrayAnalysisTest, SimpleOneToOneGather) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[5] parameter(1) + ROOT gather = s32[5,3] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,3} +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, + "(scalar-indexed %operand %indices 0->[0])"); +} + +TEST_F(IndexedArrayAnalysisTest, SimpleOneToOneConstantGather) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}}) + indices = s32[5] parameter(0) + ROOT gather = s32[5,3] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,3} +} +)"; + + AssertArrayForRootExpressionIs( + hlo_text, "(scalar-indexed-const (constant s32[3,3]) %indices 0->[0])"); +} + 
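// [Illustrative sketch, not part of this patch.] The GatherOfGather tests
// below exercise the FoldGatherOfGather rewrite documented in the header:
//   G0 = [Arr[i] for i in I0],  G1 = [G0[i] for i in I1]
//   ==>  I2 = [I0[i] for i in I1],  G1 = [Arr[i] for i in I2]
// This one-dimensional sketch (helper name made up) just checks that the two
// formulations agree on toy data.
#include <cassert>
#include <vector>

static std::vector<int> Gather1D(const std::vector<int>& source,
                                 const std::vector<int>& indices) {
  std::vector<int> result;
  result.reserve(indices.size());
  for (int i : indices) result.push_back(source[i]);
  return result;
}

int main() {
  std::vector<int> arr = {10, 20, 30, 40, 50};
  std::vector<int> i0 = {4, 2, 0};  // first gather
  std::vector<int> i1 = {2, 2, 1};  // gather of the gather

  std::vector<int> g1 = Gather1D(Gather1D(arr, i0), i1);
  std::vector<int> i2 = Gather1D(i0, i1);          // fold the index arrays
  std::vector<int> g1_folded = Gather1D(arr, i2);  // single gather from arr

  assert(g1 == g1_folded);  // {10, 10, 30} either way
  return 0;
}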
+TEST_F(IndexedArrayAnalysisTest, GatherOfGather_OneToOne) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,3] constant(s32[3,3]{{1,2,3},{1,2,3},{1,2,3}}) + indices_a = s32[5] parameter(0) + indices_b = s32[2] parameter(1) + gather_a = s32[5,3] gather(operand, indices_a), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,3} + ROOT gather_b = s32[2,3] gather(gather_a, indices_b), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,3} +} +)"; + + AssertArrayForRootExpressionIs( + hlo_text, + "(scalar-indexed-const (constant s32[3,3]) (scalar-indexed %indices_a " + "%indices_b 0->[0]) 0->[0])"); +} + +TEST_F(IndexedArrayAnalysisTest, GatherOfGather_ManyToOneWithOneToOne) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,2] parameter(0) + indices_a = s32[5,7] parameter(1) + indices_b = s32[2] parameter(2) + gather_a = s32[5,3,7] gather(operand, indices_a), + output_window_dims={1}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=2, + window_bounds={3,1} + ROOT gather_b = s32[5,3,2] gather(gather_a, indices_b), + output_window_dims={0,1}, + elided_window_dims={2}, + gather_dims_to_operand_dims={2}, + index_vector_dim=1, + window_bounds={5,3,1} +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, + "(scalar-indexed %operand (scalar-indexed " + "%indices_a %indices_b 1->[1]) 1->[0,2])"); +} + +TEST_F(IndexedArrayAnalysisTest, GatherOfGather_OneToOneWithManyToOne) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,6] parameter(0) + indices_a = s32[2] parameter(1) + indices_b = s32[5,7] parameter(2) + gather_a = s32[2,6] gather(operand, indices_a), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,6} + ROOT gather_b = s32[5,6,7] gather(gather_a, indices_b), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=2, + window_bounds={1,6} +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, + "(scalar-indexed %operand (scalar-indexed " + "%indices_a %indices_b 0->[0,1]) 0->[0,2])"); +} + +TEST_F(IndexedArrayAnalysisTest, GatherOfGather_ManyToOneWithManyToOne) { + string hlo_text = R"( +HloModule SimpleGather + +ENTRY main { + operand = s32[3,2] parameter(0) + indices_a = s32[5,7] parameter(1) + indices_b = s32[4,8] parameter(2) + gather_a = s32[5,3,7] gather(operand, indices_a), + output_window_dims={1}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=2, + window_bounds={3,1} + ROOT gather_b = s32[4,5,3,8] gather(gather_a, indices_b), + output_window_dims={1,2}, + elided_window_dims={2}, + gather_dims_to_operand_dims={2}, + index_vector_dim=2, + window_bounds={5,3,1} +} +)"; + + AssertArrayForRootExpressionIs( + hlo_text, + "(scalar-indexed %operand (scalar-indexed %indices_a %indices_b " + "1->[0,2]) 1->[0,1,3])"); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather0) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}}) + indices = s32[5] parameter(0) + gather = s32[5,4] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,4} + ROOT reshape = s32[5,2,2] reshape(gather) +} 
+)"; + + AssertArrayForRootExpressionIs( + hlo_text, "(scalar-indexed-const (constant s32[3,2,2]) %indices 0->[0])"); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather1) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}}) + indices = s32[5,7] parameter(0) + gather = s32[5,4,7] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=2, + window_bounds={1,4} + ROOT reshape = s32[5,2,2,7] reshape(gather) +} +)"; + + AssertArrayForRootExpressionIs( + hlo_text, + "(scalar-indexed-const (constant s32[3,2,2]) %indices 0->[0,3])"); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGather2) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[3,2,6] constant(s32[3,2,6]{ + {{1,2,3,4,5,6},{1,2,3,4,5,6}}, + {{1,2,3,4,5,6},{1,2,3,4,5,6}}, + {{1,2,3,4,5,6},{1,2,3,4,5,6}}}) + indices = s32[5,7] parameter(0) + gather = s32[5,2,6,7] gather(operand, indices), + output_window_dims={1,2}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=2, + window_bounds={1,2,6} + ROOT reshape = s32[5,3,4,7] reshape(gather) +} +)"; + + AssertArrayForRootExpressionIs( + hlo_text, + "(scalar-indexed-const (constant s32[3,3,4]) %indices 0->[0,3])"); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNegative0) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,2,3,4},{1,2,3,4}}) + indices = s32[5,6] parameter(0) + gather = s32[5,4,6] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=2, + window_bounds={1,4} + ROOT reshape = s32[5,2,2,2,3] reshape(gather) +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%reshape"); +} + +TEST_F(IndexedArrayAnalysisTest, ReshapeOfGatherNegative1) { + string hlo_text = R"( +HloModule ReshapeOfGather + +ENTRY main { + operand = s32[3,5,2] constant(s32[3,5,2]{ + {{1,2},{3,4},{5,6},{7,8},{9,10}}, + {{1,2},{3,4},{5,6},{7,8},{9,10}}, + {{1,2},{3,4},{5,6},{7,8},{9,10}}}) + indices = s32[7] parameter(0) + gather = s32[3,2,7] gather(operand, indices), + output_window_dims={0,1}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=1, + window_bounds={3,1,2} + ROOT reshape = s32[6,7] reshape(gather) +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%reshape"); +} + +TEST_F(IndexedArrayAnalysisTest, UnaryOpOfGather) { + string hlo_text = R"( +HloModule UnaryOpOfGather + +ENTRY main { + operand = f32[3,4] constant(f32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}}) + indices = s32[5] parameter(0) + gather = f32[5,4] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,4} + ROOT tanh = f32[5,4] tanh(gather) +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, 1 + R"( +(scalar-indexed-const (constant f32[3,4] f32[3,4] { + { 0.761594176, 0.964027584, 0.995054781, 0.999329329 }, + { 0.761594176, 0.995054781, 0.964027584, 0.999329329 }, + { 0.999329329, 0.995054781, 0.964027584, 0.761594176 } +}) %indices 0->[0]))"); +} + +TEST_F(IndexedArrayAnalysisTest, AddBroadcastedScalarWithGather) { + string hlo_text = R"( +HloModule AddBroadcastedScalarWithGather + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}}) + constant = s32[] constant(5) + constant_broadcasted 
= s32[5,4] broadcast(constant), dimensions={} + indices = s32[5] parameter(0) + gather = s32[5,4] gather(gather_operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,4} + ROOT add = s32[5,4] add(gather, constant_broadcasted) +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, 1 + R"( +(scalar-indexed-const (constant s32[3,4] s32[3,4] { + { 6, 7, 8, 9 }, + { 6, 8, 7, 9 }, + { 9, 8, 7, 6 } +}) %indices 0->[0]))"); +} + +TEST_F(IndexedArrayAnalysisTest, + SubtractBroadcastedScalarWithGather_GatherIsLhs) { + string hlo_text = R"( +HloModule SubtractBroadcastedScalarWithGather + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}}) + constant = s32[] constant(5) + constant_broadcasted = s32[5,4] broadcast(constant), dimensions={} + indices = s32[5] parameter(0) + gather = s32[5,4] gather(gather_operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,4} + ROOT sub = s32[5,4] subtract(gather, constant_broadcasted) +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, 1 + R"( +(scalar-indexed-const (constant s32[3,4] s32[3,4] { + { -4, -3, -2, -1 }, + { -4, -2, -3, -1 }, + { -1, -2, -3, -4 } +}) %indices 0->[0]))"); +} + +TEST_F(IndexedArrayAnalysisTest, + SubtractBroadcastedScalarWithGather_GatherIsRhs) { + string hlo_text = R"( +HloModule SubtractBroadcastedScalarWithGather + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}}) + constant = s32[] constant(5) + constant_broadcasted = s32[5,4] broadcast(constant), dimensions={} + indices = s32[5] parameter(0) + gather = s32[5,4] gather(gather_operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,4} + ROOT sub = s32[5,4] subtract(constant_broadcasted, gather) +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, 1 + R"( +(scalar-indexed-const (constant s32[3,4] s32[3,4] { + { 4, 3, 2, 1 }, + { 4, 2, 3, 1 }, + { 1, 2, 3, 4 } +}) %indices 0->[0]))"); +} + +TEST_F(IndexedArrayAnalysisTest, AddBroadcastedVectorWithGather) { + string hlo_text = R"( +HloModule AddBroadcastedVectorWithGather + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}}) + constant_vect = s32[4] constant({10,11,12,13}) + constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={1} + indices = s32[5] parameter(0) + gather = s32[5,4] gather(gather_operand, indices), + output_window_dims={1}, + elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,4} + ROOT add = s32[5,4] add(gather, constant_broadcasted) +} +)"; + + AssertArrayWithConstantsForRootExpressionIs(hlo_text, 1 + R"( +(scalar-indexed-const (constant s32[3,4] s32[3,4] { + { 11, 13, 15, 17 }, + { 11, 14, 14, 17 }, + { 14, 14, 14, 14 } +}) %indices 0->[0]))"); +} + +TEST_F(IndexedArrayAnalysisTest, AddBroadcastedVectorWithGather_Negative) { + string hlo_text = R"( +HloModule AddBroadcastedVectorWithGather + +ENTRY main { + gather_operand = s32[3,4] constant(s32[3,4]{{1,2,3,4},{1,3,2,4},{4,3,2,1}}) + constant_vect = s32[5] constant({10,11,12,13,14}) + constant_broadcasted = s32[5,4] broadcast(constant_vect), dimensions={0} + indices = s32[5] parameter(0) + gather = s32[5,4] gather(gather_operand, indices), + output_window_dims={1}, + 
elided_window_dims={0}, + gather_dims_to_operand_dims={0}, + index_vector_dim=1, + window_bounds={1,4} + ROOT add = s32[5,4] add(gather, constant_broadcasted) +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%add"); +} + +TEST_F(IndexedArrayAnalysisTest, RegularUnaryOp) { + string hlo_text = R"( +HloModule RegularUnaryOp + +ENTRY main { + input = f32[100] parameter(0) + ROOT tanh = f32[100] tanh(input) +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%tanh"); +} + +TEST_F(IndexedArrayAnalysisTest, RegularBinaryOp) { + string hlo_text = R"( +HloModule RegularUnaryOp + +ENTRY main { + input0 = f32[100] parameter(0) + input1 = f32[100] parameter(1) + ROOT add = f32[100] add(input0, input1) +} +)"; + + AssertArrayForRootExpressionIs(hlo_text, "%add"); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/inliner_test.cc b/tensorflow/compiler/xla/service/inliner_test.cc index 7aa1c7c8358318..d2af261008f40e 100644 --- a/tensorflow/compiler/xla/service/inliner_test.cc +++ b/tensorflow/compiler/xla/service/inliner_test.cc @@ -71,7 +71,7 @@ TEST_F(InlinerTest, MapMax) { // Verify execution on CPU. auto result = ExecuteAndTransfer(std::move(hlo_module), {}); auto expected = Literal::CreateR1({4, 3, 3, 4}); - LiteralTestUtil::ExpectEqual(*result, *expected); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } // Test that `constant` function is changed to `broadcast`. @@ -105,7 +105,7 @@ TEST_F(InlinerTest, MapConstant) { // Verify execution on CPU. auto result = ExecuteAndTransfer(std::move(hlo_module), {}); auto expected = Literal::CreateR2({{2, 2, 2, 2}, {2, 2, 2, 2}}); - LiteralTestUtil::ExpectEqual(*result, *expected); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } TEST_F(InlinerTest, MapSubtractOppositeOrder) { @@ -143,7 +143,7 @@ TEST_F(InlinerTest, MapSubtractOppositeOrder) { // Verify execution on CPU. auto result = ExecuteAndTransfer(std::move(hlo_module), {}); auto expected = Literal::CreateR1({3, 1, -1, -3}); - LiteralTestUtil::ExpectEqual(*result, *expected); + EXPECT_TRUE(LiteralTestUtil::Equal(*result, *expected)); } diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc index b9ccfeddb565b7..429c8503432b79 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion.cc @@ -28,6 +28,25 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" namespace xla { +namespace { +// These nodes can always be duplicated into consumers, even if +// InstructionFusion::may_duplicate_ is false. +// +// In general these should be nodes that get *cheaper* the more they're +// duplicated (and fused into consumers). +// +// TODO(jlebar): Duplicating instructions when we have a variable called "may +// duplicate" that's equal to false is not pretty. +bool IsAlwaysDuplicable(const HloInstruction& instruction) { + // We are always willing to duplicate a widening type-conversion instruction + // if it means we can fuse the convert into a consumer. This allows the + // consumer to read less memory, which is almost always a performance win. 
+ return instruction.opcode() == HloOpcode::kConvert && + ShapeUtil::ByteSizeOf(instruction.operand(0)->shape()) < + ShapeUtil::ByteSizeOf(instruction.shape()); +} +} // namespace + /*static*/ bool InstructionFusion::IsExpensive( const HloInstruction& instruction) { switch (instruction.opcode()) { @@ -99,13 +118,16 @@ namespace xla { case HloOpcode::kCrossReplicaSum: case HloOpcode::kCustomCall: case HloOpcode::kDivide: + case HloOpcode::kDomain: case HloOpcode::kDot: case HloOpcode::kExp: + case HloOpcode::kExpm1: case HloOpcode::kFft: case HloOpcode::kFusion: case HloOpcode::kGather: case HloOpcode::kHostCompute: case HloOpcode::kLog: + case HloOpcode::kLog1p: case HloOpcode::kMap: case HloOpcode::kParameter: case HloOpcode::kPower: @@ -128,11 +150,11 @@ namespace xla { return false; } -// An "effectively unary" operation is one that has one "large" +// An "effectively at most unary" operation is one that has at most one "large" // input with the others being negligible in terms of memory usage. // We use "has a smaller true rank than the output" as a heuristic // for "negligible" memory usage. -bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) { +bool InstructionFusion::EffectivelyAtMostUnary(HloInstruction* hlo) { int64 output_rank = 0; ShapeUtil::ForEachSubshape( hlo->shape(), @@ -156,66 +178,89 @@ bool InstructionFusion::EffectivelyUnary(HloInstruction* hlo) { } bool InstructionFusion::CanFuseOnAllPaths( - const HloReachabilityMap& reachability_map, HloInstruction* producer, - HloInstruction* consumer, DoNotFuseSet* do_not_fuse) { - auto could_fuse_on_all_paths = [&] { - // First check to see if we have already marked this producer as infeasible - // to fuse into consumer. - if (do_not_fuse->count(producer) > 0) { + HloInstruction* producer, HloInstruction* consumer, + const HloInstructionSet& do_not_duplicate) { + if (consumer == producer) { + return true; + } + if (!consumer->IsFusable()) { + return false; + } + for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) { + auto* consumer_operand = consumer->mutable_operand(i); + // If the operand is not on a path to the producer, it doesn't matter + // whether it's fusable. + if (!reachability_->IsReachable(producer, consumer_operand)) { + continue; + } + if (do_not_duplicate.count(consumer_operand) > 0 || + !ShouldFuse(consumer, i)) { return false; } - // Make sure it is possible for producer and consumer to exist in a fusion - // node. - if (!producer->IsFusable() || !consumer->IsFusable()) { + // The producer is reachable from consumer_operand which means we need + // to be able to fuse consumer_operand into consumer in order for + // producer to be fusable into consumer on all paths. + // Perform the recursive step: make sure producer can be fused into + // consumer_operand on all paths. + if (!CanFuseOnAllPaths(producer, consumer_operand, do_not_duplicate)) { return false; } - // We do an upward walk of the graph from consumer towards all paths which - // lead to producer to find any unfusable paths. - for (int64 i = 0, e = consumer->operand_count(); i < e; ++i) { - auto* consumer_operand = consumer->mutable_operand(i); - if (consumer_operand == producer) { - // This is the base case: our upward crawl ends but we need to make sure - // that fusion from consumer can happen. - if (!ShouldFuse(consumer, i)) { - return false; - } - } else if (reachability_map.IsReachable(producer, consumer_operand)) { - // The reachability map told us that consumer_operand is a node on the - // path to producer. 
We need to further investigate from - // consumer_operand. - - // First check if we have already ruled out fusing producer into - // consumer_operand. - if (do_not_fuse->count(consumer_operand) > 0) { - return false; - } - // Make sure it is possible for consumer_operand to exist in a fusion - // node. - if (!consumer_operand->IsFusable()) { - return false; - } - // The producer is reachable from consumer_operand which means we need - // to be able to fuse consumer_operand into consumer in order for - // producer to be fusable into consumer on all paths. - if (!ShouldFuse(consumer, i)) { - return false; - } - // Perform the recursive step: make sure producer can be fused into - // consumer_operand on all paths. - if (!CanFuseOnAllPaths(reachability_map, producer, consumer_operand, - do_not_fuse)) { - return false; - } + } + return true; +} + +InstructionFusion::HloInstructionSet +InstructionFusion::ComputeGloballyUnfusable( + tensorflow::gtl::ArraySlice post_order) { + // Forbid fusion of producers that: + // a) Need to be duplicated, unless they can be fused into all consumers + // via all paths. + // b) Are more than unary, that is, fusing them would likely lead to an + // increase in memory bandwidth use. + // + // Note that if we allow fusion by these global rules, we may still forbid + // fusing operations that require duplication later depending on + // is_expensive_(). + HloInstructionSet do_not_duplicate; + for (HloInstruction* consumer : post_order) { + for (HloInstruction* producer : consumer->operands()) { + if (do_not_duplicate.count(producer) > 0) { + continue; } + + // If the producer is effectively not more than unary, duplicating it + // will not increase the number of relevant inputs read, as the fusion + // node will only need to read at most 1 relevant input (the input of + // the producer). In that case, we do not forbid fusion of the operation + // here. + if (EffectivelyAtMostUnary(producer)) { + continue; + } + // Otherwise we will forbid fusing the op unless we can fuse it into + // all of its consumers on all paths. + // + // That means, that for: + // A --> B (fusable) + // \-> C (non-fusable) + // A will be not allowed to be fused into B, as it cannot be fused into C. + // + // Similarly, for: + // A -------------> B + // \-> C -> D -/ + // If: + // - A is fusable into B and C, and D is fusable into B + // - C is *not* fusable into D + // A will be not allowed to be fused into B, as it cannot be fused via + // all paths. + if (producer->IsFusable() && + CanFuseOnAllPaths(producer, consumer, do_not_duplicate)) { + continue; + } + do_not_duplicate.insert(producer); } - return true; - }; - if (could_fuse_on_all_paths()) { - return true; } - // We couldn't fuse on all paths, record this result. - do_not_fuse->insert(producer); - return false; + + return do_not_duplicate; } StatusOr InstructionFusion::Run(HloModule* module) { @@ -227,6 +272,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { for (auto* computation : module->MakeNonfusionComputations()) { CHECK(!computation->IsFusionComputation()); computation_ = computation; + reachability_ = computation_->ComputeReachability(); // We want to be able to remove arbitrary instructions from the post order // and also compare positions of instructions in the post order. 
To make @@ -244,36 +290,7 @@ StatusOr InstructionFusion::Run(HloModule* module) { InsertOrDie(&post_order_index, post_order[i], i); } - DoNotFuseSet do_not_fuse; - auto reachability = computation->ComputeReachability(); - - auto cheap_to_duplicate = [this](HloInstruction* producer) { - if (producer->opcode() == HloOpcode::kBroadcast) { - return true; - } - if (producer->opcode() == HloOpcode::kConstant && - ShapeUtil::IsEffectiveScalar(producer->shape())) { - return true; - } - if (EffectivelyUnary(producer)) { - return true; - } - return false; - }; - - for (HloInstruction* consumer : post_order) { - for (HloInstruction* producer : consumer->operands()) { - if (cheap_to_duplicate(producer)) { - continue; - } - if (CanFuseOnAllPaths(*reachability, producer, consumer, - &do_not_fuse)) { - CHECK_EQ(do_not_fuse.count(producer), 0); - } else { - CHECK_GT(do_not_fuse.count(producer), 0); - } - } - } + HloInstructionSet do_not_duplicate = ComputeGloballyUnfusable(post_order); // Instruction fusion effectively fuses edges in the computation graph // (producer instruction -> consumer instruction) so we iterate over all @@ -341,9 +358,20 @@ StatusOr InstructionFusion::Run(HloModule* module) { // ensures that B will be considered before A. // // We store the original indices of the operands to pass to ShouldFuse. - std::vector sorted_operand_numbers(instruction->operands().size()); - std::iota(std::begin(sorted_operand_numbers), - std::end(sorted_operand_numbers), 0); + std::vector sorted_operand_numbers; + sorted_operand_numbers.reserve(instruction->operands().size()); + for (int i = 0; i < instruction->operands().size(); ++i) { + // This will happen if we have two possible instructions to fuse the + // same operand into; once the operand is fused into one instruction, + // the other instruction will get a new get-tuple-element as its + // operand, which is not in the post-order index. + // TODO(tjoerg): Look into fusing past these multi-output fuse points. + if (post_order_index.find(instruction->mutable_operand(i)) == + post_order_index.end()) { + continue; + } + sorted_operand_numbers.push_back(i); + } std::sort( sorted_operand_numbers.begin(), sorted_operand_numbers.end(), [&](int64 i, int64 j) { @@ -360,13 +388,20 @@ StatusOr InstructionFusion::Run(HloModule* module) { if (!operand->IsFusable()) { continue; } - if (!ShouldFuse(instruction, i)) { - continue; - } - if (do_not_fuse.count(operand) > 0) { + + HloInstruction* fusion_instruction; + // Try "regular" fusion if the operand may be duplicated. Otherwise, + // perform multi-output fusion, unless this creates a cycle. + // TODO(tjoerg): Consider making multi-output fusion the default. + if (ShouldFuse(instruction, i) && + do_not_duplicate.count(operand) == 0) { + fusion_instruction = Fuse(operand, instruction); + } else if (ShouldFuseIntoMultiOutput(instruction, i) && + !MultiOutputFusionCreatesCycle(operand, instruction)) { + fusion_instruction = FuseIntoMultiOutput(operand, instruction); + } else { continue; } - HloInstruction* fusion_instruction = Fuse(operand, instruction); // Fusing an instruction into a fusion instruction can change the // operand set of the fusion instruction. 
For simplicity just push the @@ -397,12 +432,9 @@ StatusOr InstructionFusion::Run(HloModule* module) { return changed; } -HloInstruction* InstructionFusion::Fuse(HloInstruction* producer, - HloInstruction* consumer) { +HloInstruction* InstructionFusion::AddFusionInstruction( + HloInstruction* producer, HloInstruction* consumer) { HloInstruction* fusion_instruction; - - VLOG(2) << "Fusing " << producer->ToString() << " into " - << consumer->ToString(); auto kind = ChooseKind(producer, consumer); if (consumer->opcode() == HloOpcode::kFusion) { fusion_instruction = consumer; @@ -414,17 +446,48 @@ HloInstruction* InstructionFusion::Fuse(HloInstruction* producer, HloInstruction::CreateFusion(consumer->shape(), kind, consumer)); TF_CHECK_OK(computation_->ReplaceInstruction(consumer, fusion_instruction)); } + return fusion_instruction; +} +HloInstruction* InstructionFusion::Fuse(HloInstruction* producer, + HloInstruction* consumer) { + VLOG(2) << "Fusing " << producer->ToString() << " into " + << consumer->ToString(); + HloInstruction* fusion_instruction = AddFusionInstruction(producer, consumer); fusion_instruction->FuseInstruction(producer); return fusion_instruction; } +HloInstruction* InstructionFusion::FuseIntoMultiOutput( + HloInstruction* producer, HloInstruction* consumer) { + VLOG(2) << "Multi-output fusing " << producer->ToString() << " into " + << consumer->ToString(); + HloInstruction* fusion_instruction = AddFusionInstruction(producer, consumer); + fusion_instruction->FuseInstructionIntoMultiOutput(producer); + return fusion_instruction; +} + +bool InstructionFusion::MultiOutputFusionCreatesCycle( + HloInstruction* producer, HloInstruction* consumer) { + return c_any_of( + consumer->operands(), [&](const HloInstruction* consumer_operand) { + // The fusion algorithm traverses the HLO graph in reverse post order. + // Thus `cosumers` is visited before its operands (including + // `producer`). Therefore, consumer operands cannot have been fused yet. + // It is thus safe to use the pre-computed reachability map. + return consumer_operand != producer && + reachability_->IsReachable(producer, consumer_operand); + }); +} + bool InstructionFusion::ShouldFuse(HloInstruction* consumer, int64 operand_index) { HloInstruction* producer = consumer->mutable_operand(operand_index); + // Cost condition: don't duplicate expensive instructions. if (FusionWouldDuplicate(*producer, *consumer) && - (is_expensive_(*producer) || !may_duplicate_)) { + (!may_duplicate_ || is_expensive_(*producer)) && + !IsAlwaysDuplicable(*producer)) { return false; } diff --git a/tensorflow/compiler/xla/service/instruction_fusion.h b/tensorflow/compiler/xla/service/instruction_fusion.h index 152d0886ee9eda..f73ca9adf768ed 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion.h +++ b/tensorflow/compiler/xla/service/instruction_fusion.h @@ -61,6 +61,14 @@ class InstructionFusion : public HloPassInterface { // Subtypes can override this with target-specific heuristics. virtual bool ShouldFuse(HloInstruction* consumer, int64 operand_index); + // Returns whether multi-output fusion can be applied to fuse `producer` into + // `consumer`. In contrast to "regular" fusion, the `producer` is not + // duplicated by multi-output fusion. + virtual bool ShouldFuseIntoMultiOutput(HloInstruction* consumer, + int64 operand_index) { + return false; + } + // Chooses a fusion kind for `producer` and `consumer`. // Default method chooses `kLoop`. 
virtual HloInstruction::FusionKind ChooseKind(const HloInstruction* producer, @@ -70,11 +78,18 @@ class InstructionFusion : public HloPassInterface { virtual HloInstruction* Fuse(HloInstruction* producer, HloInstruction* consumer); - // An "effectively unary" operation is one that has one "large" + // Creates a new fusion instruction containing `producer` and `consumer`. A + // tuple is added as the fusion instruction's root, which consumes from both, + // `producer` and `consumer`. This style of fusion is referred to as + // multi-output fusion. + virtual HloInstruction* FuseIntoMultiOutput(HloInstruction* producer, + HloInstruction* consumer); + + // An "effectively unary" operation is one that has at most one "large" // input with the others being negligible in terms of memory usage. // We use "has a smaller true rank than the output" as a heuristic // for "negligible" memory usage. - bool EffectivelyUnary(HloInstruction* hlo); + bool EffectivelyAtMostUnary(HloInstruction* hlo); // Returns true if fusing producer into consumer would cause producer to be // duplicated. This is the case if producer has uses other than consumer. @@ -90,21 +105,34 @@ class InstructionFusion : public HloPassInterface { // Current HloComputation instance the loop fuser is traversing. HloComputation* computation_; HloModule* module_; + // Reachability information for the current computation. + std::unique_ptr reachability_; private: // The set of producers whose consumers we cannot fuse into. - using DoNotFuseSet = std::unordered_set; + using HloInstructionSet = std::unordered_set; - // Whether or not we can fuse consumer into original_producer on all paths + HloInstruction* AddFusionInstruction(HloInstruction* producer, + HloInstruction* consumer); + + // Whether or not we can fuse producer into consumer on all paths // from the producer to the consumer where nodes are HLOs and edges are uses. - bool CanFuseOnAllPaths(const HloReachabilityMap& reachability_map, - HloInstruction* producer, HloInstruction* consumer, - DoNotFuseSet* do_not_fuse); + bool CanFuseOnAllPaths(HloInstruction* producer, HloInstruction* consumer, + const HloInstructionSet& do_not_fuse); + + // Computes the set of nodes that we do not want to fuse into any of their + // consumers based on a global analysis of the HLO graph. + HloInstructionSet ComputeGloballyUnfusable( + tensorflow::gtl::ArraySlice post_order); // Used to determine if an HLO is expensive. Expensive operations will not be // duplicated. std::function is_expensive_; + // Whether multi-output fusion would introduce a cycle into the HLO graph. + bool MultiOutputFusionCreatesCycle(HloInstruction* producer, + HloInstruction* consumer); + // Returns whether we may duplicate an instruction if we want to fuse it. bool may_duplicate_; diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc index 0fa2c95fb458f8..21db2338995960 100644 --- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc +++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc @@ -16,12 +16,100 @@ limitations under the License. 
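// [Illustrative sketch, not part of this patch.] MultiOutputFusionCreatesCycle
// above rejects multi-output fusion of `producer` into `consumer` when some
// *other* operand of the consumer is reachable from the producer: after
// fusion that operand would both depend on and feed the fused node. The toy
// reachability relation and helper names below are made up; only the shape of
// the std::any_of check mirrors the code above.
#include <algorithm>
#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

using Edge = std::pair<std::string, std::string>;  // (from, to): "from reaches to"

static bool IsReachable(const std::set<Edge>& reachability,
                        const std::string& from, const std::string& to) {
  return from == to || reachability.count({from, to}) > 0;
}

static bool MultiOutputFusionWouldCreateCycle(
    const std::set<Edge>& reachability, const std::string& producer,
    const std::string& consumer,
    const std::vector<std::string>& consumer_operands) {
  return std::any_of(consumer_operands.begin(), consumer_operands.end(),
                     [&](const std::string& operand) {
                       return operand != producer &&
                              IsReachable(reachability, producer, operand);
                     });
}

int main() {
  // producer -> other -> consumer, plus producer -> consumer directly:
  // fusing producer and consumer into one node would make `other` both a
  // user and an operand of that node, i.e. a cycle.
  std::set<Edge> reachability = {{"producer", "other"},
                                 {"producer", "consumer"},
                                 {"other", "consumer"}};
  std::cout << MultiOutputFusionWouldCreateCycle(reachability, "producer",
                                                 "consumer",
                                                 {"producer", "other"})
            << "\n";  // prints 1
  // Without the producer -> other path there is no cycle.
  std::cout << MultiOutputFusionWouldCreateCycle({{"producer", "consumer"}},
                                                 "producer", "consumer",
                                                 {"producer", "other"})
            << "\n";  // prints 0
  return 0;
}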
#include "tensorflow/compiler/xla/service/instruction_fusion.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" namespace xla { +namespace op = xla::testing::opcode_matchers; + using InstructionFusionTest = HloTestBase; +// Subclass of InstructionFusion exposing the protected methods Fuse and +// FuseIntoMultiOutput for testing. +class InstructionFusionForTesting : public InstructionFusion { + public: + explicit InstructionFusionForTesting(HloModule* module) + : InstructionFusion(InstructionFusion::IsExpensive) { + module_ = module; + computation_ = module->entry_computation(); + } + + HloInstruction* Fuse(HloInstruction* producer, + HloInstruction* consumer) override { + return InstructionFusion::Fuse(producer, consumer); + } + + HloInstruction* FuseIntoMultiOutput(HloInstruction* producer, + HloInstruction* consumer) override { + return InstructionFusion::FuseIntoMultiOutput(producer, consumer); + } +}; + +TEST_F(InstructionFusionTest, FuseInstructions) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY entry_computation { + p0 = f32[4,3]{1,0} parameter(0) + add = f32[4,3]{1,0} add(p0, p0) + ROOT sub = f32[4,3]{1,0} subtract(add, p0) + })") + .ValueOrDie(); + HloInstruction* sub = module->entry_computation()->root_instruction(); + HloInstruction* add = sub->mutable_operand(0); + HloInstruction* fusion = + InstructionFusionForTesting(module.get()).Fuse(add, sub); + + ASSERT_THAT(fusion, op::Fusion()) << module->ToString(); + EXPECT_THAT(fusion->fused_expression_root(), + op::Subtract(op::Add(), op::Parameter())) + << module->ToString(); +} + +TEST_F(InstructionFusionTest, FuseIntoFusionInstruction) { + auto module = ParseHloString(R"( + HloModule test_module + fused_computation { + p1 = f32[4,3] parameter(0) + add = f32[4,3] add(p1, p1) + } + ENTRY entry_computation { + p0 = f32[4,3] parameter(0) + abs = f32[4,3] abs(p0) + ROOT fusion = f32[4,3] fusion(abs), kind=kLoop, calls=fused_computation + })") + .ValueOrDie(); + HloInstruction* root = module->entry_computation()->root_instruction(); + HloInstruction* abs = root->mutable_operand(0); + HloInstruction* fusion = + InstructionFusionForTesting(module.get()).Fuse(abs, root); + + ASSERT_THAT(fusion, op::Fusion()) << module->ToString(); + EXPECT_THAT(fusion->fused_expression_root(), op::Add(op::Abs(), op::Abs())) + << module->ToString(); +} + +TEST_F(InstructionFusionTest, FuseInstructionsIntoMultiOutput) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY entry_computation { + p0 = f32[4,3]{1,0} parameter(0) + abs = f32[4,3]{1,0} abs(p0) + tanh = f32[4,3]{1,0} tanh(abs) + ROOT add = f32[4,3]{1,0} add(abs, tanh) + })") + .ValueOrDie(); + HloInstruction* root = module->entry_computation()->root_instruction(); + HloInstruction* abs = root->mutable_operand(0); + HloInstruction* tanh = root->mutable_operand(1); + HloInstruction* fusion = + InstructionFusionForTesting(module.get()).FuseIntoMultiOutput(abs, tanh); + + ASSERT_THAT(fusion, op::Fusion()) << module->ToString(); + EXPECT_THAT(fusion->fused_expression_root(), op::Tuple(op::Tanh(), op::Abs())) + << module->ToString(); +} + TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) { HloComputation::Builder builder(TestName()); auto param0 = builder.AddInstruction( @@ -89,7 +177,172 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusable) { EXPECT_FALSE( InstructionFusion(InstructionFusion::IsExpensive, 
/*may_duplicate=*/true) .Run(module.get()) - .ValueOrDie()); + .ValueOrDie()) + << module->ToString(); +} + +// Counts the number of HLO ops with a given op code in the specified module. +static int Count(const HloModule& module, HloOpcode op) { + int count = 0; + for (const auto* computation : module.computations()) { + for (const auto* instruction : computation->instructions()) { + if (instruction->opcode() == op) { + ++count; + } + } + } + return count; +} + +TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add = f32[4,3]{1,0} add(p0, p0) + ROOT root = f32[4,3]{1,0} subtract(add, add) + })") + .ValueOrDie(); + // Expect the add and subtraction to be fused. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString(); + + // Make sure the add hasn't been duplicated. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString(); +} + +TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) { + // Make sure we do not duplicate the add, as we cannot fuse through the rng. + // + // p0 -> add -------------------------> sub + // \-> abs1 -> rng -> abs2 -/ + auto module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add = f32[4,3]{1,0} add(p0, p0) + abs1 = f32[4,3]{1,0} abs(add) + rng = f32[4,3]{1,0} rng(abs1), distribution=rng_uniform + abs2 = f32[4,3]{1,0} abs(rng) + ROOT root = f32[4,3]{1,0} subtract(abs2, add) + })") + .ValueOrDie(); + // We expect abs2 to be fused into root. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Fusion()); + EXPECT_THAT(root->fused_expression_root(), + op::Subtract(op::Abs(op::Parameter()), op::Parameter())) + << module->ToString(); + + // Make sure the add hasn't been duplicated. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString(); + + // Use a log node with a second consumer to break the fusion. + // + // p0 -> add -------------------------> sub + // \-> abs1 -> log -> abs2 -/ + // \-> send + module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add = f32[4,3]{1,0} add(p0, p0) + abs1 = f32[4,3]{1,0} abs(add) + log = f32[4,3]{1,0} log(abs1) + send = f32[4,3]{1,0} send(log), channel_id=0 + abs2 = f32[4,3]{1,0} abs(log) + ROOT root = f32[4,3]{1,0} subtract(abs2, add) + })") + .ValueOrDie(); + + // We expect abs2 to be fused into root and abs1 to be fused into log. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 2) << module->ToString(); + + // Make sure the add hasn't been duplicated. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 1) << module->ToString(); + + // Make sure we still fuse ops where one operand in the chain to the producer + // can't be fused. 
+ // + // p0 ---> add1 -----------> sub + // \ \-> add2 -/ + // \-> log -/ + // \-> send + module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add1 = f32[4,3]{1,0} add(p0, p0) + log = f32[4,3]{1,0} log(p0) + send = f32[4,3]{1,0} send(log), channel_id=0 + add2 = f32[4,3]{1,0} add(log, add1) + ROOT root = f32[4,3]{1,0} subtract(add1, add2) + })") + .ValueOrDie(); + + // Expect the add1 and add2 to be fused into root. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + EXPECT_EQ(Count(*module, HloOpcode::kFusion), 1) << module->ToString(); + + // Make sure we didn't duplicate any adds. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString(); + + // A variant of the above that allows the algorithm to put add2 into the set + // of unfusable ops to short-circuit the decision whether add1 should be fused + // into sub2. + // + // /---------------\ + // p0 ---> add1 ---> add2 ------> sub2 + // \------> sub1 + // log -/ + // \-> send + module = ParseHloString(R"( + HloModule test_module + ENTRY OutputFusion { + p0 = f32[4,3]{1,0} parameter(0) + add1 = f32[4,3]{1,0} add(p0, p0) + add2 = f32[4,3]{1,0} add(add1, add1) + log = f32[4,3]{1,0} log(add2) + send = f32[4,3]{1,0} send(log), channel_id=0 + sub1 = f32[4,3]{1,0} subtract(log, add2) + sub2 = f32[4,3]{1,0} subtract(add2, add1) + ROOT root = (f32[4,3]{1,0}, f32[4,3]{1,0}) tuple(sub1, sub2) + })") + .ValueOrDie(); + + // Expect sub1 and sub2 to be fused into root. + EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Fusion()); + EXPECT_THAT(root->fused_expression_root(), + op::Tuple(op::Subtract(op::Parameter(), op::Parameter()), + op::Subtract(op::Parameter(), op::Parameter()))) + << module->ToString(); + + // Make sure we didn't duplicate any adds. + EXPECT_EQ(Count(*module, HloOpcode::kAdd), 2) << module->ToString(); } TEST_F(InstructionFusionTest, AllowUnaryDuplication) { @@ -135,4 +388,29 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) { .ValueOrDie()); } +TEST_F(InstructionFusionTest, + WideningConvertsAreAlwaysDuplicableIntoConsumers) { + auto module = ParseHloString(R"( + HloModule test_module + ENTRY Test { + p0 = f16[100] parameter(0) + c = f32[100] convert(p0) + add = f32[100] add(c, c) + ROOT mul = f32[100] multiply(c, c) + })") + .ValueOrDie(); + + // The convert should be fused into the add and mul, even though may_duplicate + // is false, because it's always beneficial to fuse/duplicate widening + // converts into consumers. 
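// [Illustrative sketch, not part of this patch.] The "always duplicable" rule
// exercised by this test boils down to comparing the byte sizes of the
// convert's operand and result (see IsAlwaysDuplicable earlier in this
// change). The element sizes used below are the standard ones (f16 = 2 bytes,
// f32 = 4 bytes); the helper name is made up.
#include <cstdint>
#include <iostream>

static bool IsWideningConvert(int64_t operand_bytes_per_element,
                              int64_t result_bytes_per_element,
                              int64_t num_elements) {
  return operand_bytes_per_element * num_elements <
         result_bytes_per_element * num_elements;
}

int main() {
  // f16[100] -> f32[100]: 200 bytes read vs. 400 bytes produced, widening,
  // so duplicating the convert into each consumer reduces memory traffic.
  std::cout << IsWideningConvert(2, 4, 100) << "\n";  // 1
  // f32[100] -> f16[100]: narrowing, so not always duplicable.
  std::cout << IsWideningConvert(4, 2, 100) << "\n";  // 0
  return 0;
}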
+ EXPECT_TRUE( + InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/false) + .Run(module.get()) + .ValueOrDie()) + << module->ToString(); + + HloInstruction* root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, op::Fusion(op::Parameter())); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/interpreter/BUILD b/tensorflow/compiler/xla/service/interpreter/BUILD index 45505484951abf..524d3234eb4eff 100644 --- a/tensorflow/compiler/xla/service/interpreter/BUILD +++ b/tensorflow/compiler/xla/service/interpreter/BUILD @@ -18,7 +18,6 @@ cc_library( "//tensorflow/compiler/xla/service:transfer_manager", "//tensorflow/compiler/xla/service/interpreter:platform_id", "//tensorflow/core:lib", - "//tensorflow/core:stream_executor_no_cuda", ], alwayslink = True, # Contains per-platform transfer manager registration ) @@ -117,6 +116,5 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_headers_lib", - "//tensorflow/core:stream_executor_no_cuda", ], ) diff --git a/tensorflow/compiler/xla/service/interpreter/README.md b/tensorflow/compiler/xla/service/interpreter/README.md index 4c19a1b916d421..0b21b251c3f663 100644 --- a/tensorflow/compiler/xla/service/interpreter/README.md +++ b/tensorflow/compiler/xla/service/interpreter/README.md @@ -5,7 +5,7 @@ evaluating the result of the HLO graph directly with HloEvaluator, without lowering it further (to LLVM IR for example) before execution as other backends (CPU and GPU for example) do. -Its key componenets are: +Its key components are: * [`InterpreterCompiler`] despite the inherited naming of "compiler", all `InterpreterCompiler` really does is the following: diff --git a/tensorflow/compiler/xla/service/interpreter/compiler.cc b/tensorflow/compiler/xla/service/interpreter/compiler.cc index 76b3ecad26fe92..c1666530687f2f 100644 --- a/tensorflow/compiler/xla/service/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/service/interpreter/compiler.cc @@ -34,7 +34,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/reshape_mover.h" #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/types.h" @@ -45,8 +44,7 @@ Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { HloPassPipeline pipeline("Interpreter"); pipeline.AddPass( - hlo_module->mutable_entry_computation_layout()); - + hlo_module->mutable_device_entry_computation_layout()); return pipeline.Run(hlo_module).status(); } @@ -71,7 +69,8 @@ StatusOr> InterpreterCompiler::RunBackend( // Create executable from only the Hlo module. 
std::unique_ptr executable = - xla::MakeUnique(std::move(hlo_module)); + xla::MakeUnique(std::move(hlo_module), + xla::MakeUnique()); return std::move(executable); } @@ -101,17 +100,14 @@ HloCostAnalysis::ShapeSizeFunction InterpreterCompiler::ShapeSizeBytesFunction() return InterpreterExecutable::ShapeSizeBytes; } -static std::unique_ptr CreateComputationPlacer() { - return xla::MakeUnique(); -} - static bool InitModule() { xla::Compiler::RegisterCompilerFactory( se::interpreter::kXlaInterpreterPlatformId, []() { return xla::MakeUnique(); }); xla::ComputationPlacer::RegisterComputationPlacer( - se::interpreter::kXlaInterpreterPlatformId, &CreateComputationPlacer); + se::interpreter::kXlaInterpreterPlatformId, + []() { return xla::MakeUnique(); }); return true; } diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc index 61f199bc9e8f4f..029e71058a7373 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.cc +++ b/tensorflow/compiler/xla/service/interpreter/executable.cc @@ -24,7 +24,6 @@ limitations under the License. #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/interpreter/executor.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" @@ -32,16 +31,17 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" namespace xla { namespace interpreter { InterpreterExecutable::InterpreterExecutable( - std::unique_ptr hlo_module) + std::unique_ptr hlo_module, + std::unique_ptr evaluator) : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr, - /*hlo_profile_index_map=*/nullptr) {} + /*hlo_profile_index_map=*/nullptr), + evaluator_(std::move(evaluator)) {} InterpreterExecutable::~InterpreterExecutable() {} @@ -82,10 +82,13 @@ StatusOr InterpreterExecutable::ExecuteOnStream( } // Execute the graph using the HloEvaluator. - HloEvaluator evaluator; - TF_ASSIGN_OR_RETURN( - std::unique_ptr result_literal, - evaluator.Evaluate>(*computation, arg_literals)); + std::unique_ptr result_literal; + { + tensorflow::mutex_lock lock(evaluator_lock_); + TF_ASSIGN_OR_RETURN(result_literal, + evaluator_->Evaluate>( + *computation, arg_literals)); + } // Transform the result literal back into a ShapedBuffer. TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result, diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h index b0b797ca7d6f44..91d8148d26dc8e 100644 --- a/tensorflow/compiler/xla/service/interpreter/executable.h +++ b/tensorflow/compiler/xla/service/interpreter/executable.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h" +#include "tensorflow/compiler/xla/service/hlo_evaluator.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" @@ -30,6 +31,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/mutex.h" #include "tensorflow/core/platform/stream_executor_no_cuda.h" #include "tensorflow/core/platform/types.h" @@ -40,13 +42,15 @@ namespace interpreter { // buffer allocation. Refer to interpreter/README.md for more. class InterpreterExecutable : public Executable { public: - InterpreterExecutable(std::unique_ptr hlo_module); + InterpreterExecutable(std::unique_ptr hlo_module, + std::unique_ptr evaluator); ~InterpreterExecutable() override; StatusOr ExecuteOnStream( const ServiceExecutableRunOptions* run_options, tensorflow::gtl::ArraySlice arguments, - HloExecutionProfile* hlo_execution_profile) override; + HloExecutionProfile* hlo_execution_profile) override + LOCKS_EXCLUDED(evaluator_lock_); StatusOr ExecuteAsyncOnStream( const ServiceExecutableRunOptions* run_options, @@ -54,6 +58,11 @@ class InterpreterExecutable : public Executable { static int64 ShapeSizeBytes(const Shape& shape); + protected: + // The interpreter interprets executables with an HloEvaluator. + std::unique_ptr evaluator_ PT_GUARDED_BY(evaluator_lock_); + mutable tensorflow::mutex evaluator_lock_; + private: TF_DISALLOW_COPY_AND_ASSIGN(InterpreterExecutable); }; diff --git a/tensorflow/compiler/xla/service/interpreter/platform.cc b/tensorflow/compiler/xla/service/interpreter/platform.cc index 92e069a8c67c1d..42c2c28997d5f3 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.cc +++ b/tensorflow/compiler/xla/service/interpreter/platform.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/service/interpreter/executor.h" -#include "tensorflow/compiler/xla/service/interpreter/platform_id.h" #include "tensorflow/stream_executor/device_options.h" #include "tensorflow/stream_executor/lib/initialize.h" #include "tensorflow/stream_executor/lib/ptr_util.h" @@ -31,13 +30,13 @@ limitations under the License. namespace stream_executor { namespace interpreter { -XlaInterpreterPlatform::XlaInterpreterPlatform() : name_("Interpreter") {} +XlaInterpreterPlatform::XlaInterpreterPlatform(const string& name, + const Platform::Id& id) + : name_(name), id_(id) {} XlaInterpreterPlatform::~XlaInterpreterPlatform() {} -Platform::Id XlaInterpreterPlatform::id() const { - return kXlaInterpreterPlatformId; -} +Platform::Id XlaInterpreterPlatform::id() const { return id_; } int XlaInterpreterPlatform::VisibleDeviceCount() const { return 1; } @@ -106,8 +105,6 @@ REGISTER_MODULE_INITIALIZER( interpreter_platform, stream_executor::interpreter::InitializeXlaInterpreterPlatform()); -DECLARE_MODULE_INITIALIZER(multi_platform_manager); - // Note that module initialization sequencing is not supported in the // open-source project, so this will be a no-op there. REGISTER_MODULE_INITIALIZER_SEQUENCE(interpreter_platform, diff --git a/tensorflow/compiler/xla/service/interpreter/platform.h b/tensorflow/compiler/xla/service/interpreter/platform.h index d68c5aa20dda7a..0187f6d473b19f 100644 --- a/tensorflow/compiler/xla/service/interpreter/platform.h +++ b/tensorflow/compiler/xla/service/interpreter/platform.h @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include "tensorflow/compiler/xla/service/interpreter/platform_id.h" #include "tensorflow/stream_executor/executor_cache.h" #include "tensorflow/stream_executor/plugin.h" #include "tensorflow/stream_executor/stream_executor.h" @@ -28,7 +29,8 @@ namespace interpreter { class XlaInterpreterPlatform : public Platform { public: - XlaInterpreterPlatform(); + XlaInterpreterPlatform(const string& name = "Interpreter", + const Platform::Id& id = kXlaInterpreterPlatformId); ~XlaInterpreterPlatform() override; Platform::Id id() const override; @@ -55,6 +57,8 @@ class XlaInterpreterPlatform : public Platform { private: // This platform's name. string name_; + // This platform's id. + Platform::Id id_; // Cache of created StreamExecutors. ExecutorCache executor_cache_; diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc index 2494569db53f26..7067b6f86a0fb2 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.cc +++ b/tensorflow/compiler/xla/service/layout_assignment.cc @@ -31,10 +31,12 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/computation_layout.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" +#include "tensorflow/compiler/xla/service/hlo_dce.h" #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" #include "tensorflow/compiler/xla/service/logical_buffer.h" +#include "tensorflow/compiler/xla/service/tuple_simplifier.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -400,9 +402,9 @@ string LayoutConstraints::ToString() const { } Status LayoutAssignment::AddMandatoryConstraints( - const ComputationLayout& computation_layout, - const ChannelLayoutConstraints* channel_constraints, - HloComputation* computation, LayoutConstraints* constraints) { + const ComputationLayout* computation_layout, + ChannelLayoutConstraints* channel_constraints, HloComputation* computation, + LayoutConstraints* constraints) { VLOG(3) << "Adding mandatory layout constraints to computation " << computation->name(); @@ -424,11 +426,16 @@ Status LayoutAssignment::AddMandatoryConstraints( TF_RETURN_IF_ERROR(constraints->SetOperandLayout( instruction->outfeed_shape(), instruction, 0)); } else if (instruction->opcode() == HloOpcode::kParameter) { - // Parameter layouts must match the respective layout in - // ComputationLayout. - shape_with_layout = - &computation_layout.parameter_layout(instruction->parameter_number()) - .shape(); + if (computation_layout != nullptr) { + const ShapeLayout& parameter_layout = + computation_layout->parameter_layout( + instruction->parameter_number()); + if (parameter_layout.LayoutIsSet()) { + // Parameter layouts must match the respective layout in + // ComputationLayout, if there is one. 
+ shape_with_layout = &parameter_layout.shape(); + } + } } if (shape_with_layout != nullptr) { TF_RETURN_IF_ERROR( @@ -493,9 +500,8 @@ Status LayoutAssignment::AddMandatoryConstraints( HloComputation* body = instruction->while_body(); HloComputation* condition = instruction->while_condition(); const HloInstruction* init = instruction->operand(0); - const ComputationLayout& body_layout = - FindOrDie(computation_layouts_, body); - const ComputationLayout& condition_layout = + ComputationLayout& body_layout = FindOrDie(computation_layouts_, body); + ComputationLayout& condition_layout = FindOrDie(computation_layouts_, condition); // Check a few invariants irrespective of layout. @@ -508,26 +514,19 @@ Status LayoutAssignment::AddMandatoryConstraints( condition_layout.parameter_shape(0))); DCHECK(ShapeUtil::Compatible(body_layout.result_shape(), init->shape())); - // Return error if earlier layout assignment of the embedded computations - // has produced conflicting layouts. - if (!ShapeUtil::Equal(body_layout.result_shape(), - body_layout.parameter_shape(0))) { - return InternalError( - "Parameter and result of body computation %s of while instruction " - "%s have different layouts: %s vs %s", - body->name().c_str(), instruction->name().c_str(), - ShapeUtil::HumanString(body_layout.result_shape()).c_str(), - ShapeUtil::HumanString(body_layout.parameter_shape(0)).c_str()); + if (body_layout.result_layout() != body_layout.parameter_layout(0)) { + VLOG(2) << "Reset %while body parameter layout: body=" << body->name() + << " while=" << instruction->name() + << " shape=" << body_layout.result_layout().ToString(); + *body_layout.mutable_parameter_layout(0) = body_layout.result_layout(); } - if (!ShapeUtil::Equal(body->root_instruction()->shape(), - condition->parameter_instruction(0)->shape())) { - return InternalError( - "Parameter of condition computation %s of while instruction " - "%s does not match body computation %s result: %s vs %s", - condition->name().c_str(), instruction->name().c_str(), - body->name().c_str(), - ShapeUtil::HumanString(condition_layout.parameter_shape(0)).c_str(), - ShapeUtil::HumanString(body_layout.result_shape()).c_str()); + if (condition_layout.parameter_layout(0) != + body_layout.parameter_layout(0)) { + VLOG(2) << "Reset %while condition parameter layout: cond=" + << condition->name() << " while=" << instruction->name() + << " shape=" << body_layout.parameter_layout(0).ToString(); + *condition_layout.mutable_parameter_layout(0) = + body_layout.parameter_layout(0); } // Constrain the output and the operand of the while instruction to match @@ -557,7 +556,20 @@ Status LayoutAssignment::AddMandatoryConstraints( true_computation_layout.parameter_shape(0))); DCHECK(ShapeUtil::Compatible( false_operand->shape(), false_computation_layout.parameter_shape(0))); - + if (true_computation_layout.result_layout() != + false_computation_layout.result_layout()) { + // We assign layouts in DFS fashion, so the true and false computations + // might have negotiated a different layout. But from the conditional + // instruction's point of view the layout must match, so we run again on the false + // computation, this time with the proper computation layout.
+ VLOG(2) << "Reset %conditional false computation result layout: " + "false_computation=" + << false_computation->name() + << " conditional=" << instruction->name() << " shape=" + << true_computation_layout.result_layout().ToString(); + *false_computation_layout.mutable_result_layout() = + true_computation_layout.result_layout(); + } TF_RETURN_IF_ERROR(constraints->SetInstructionLayout( true_computation_layout.result_shape(), instruction)); TF_RETURN_IF_ERROR(constraints->SetOperandLayout( @@ -593,10 +605,14 @@ Status LayoutAssignment::AddMandatoryConstraints( } } } - - // Finally set the result layout to match ComputationLayout. - return constraints->SetResultLayout( - computation_layout.result_layout().shape()); + // Finally set the result layout to match ComputationLayout, if there is one. + if (computation_layout != nullptr) { + const ShapeLayout& result_layout = computation_layout->result_layout(); + if (result_layout.LayoutIsSet()) { + TF_RETURN_IF_ERROR(constraints->SetResultLayout(result_layout.shape())); + } + } + return Status::OK(); } namespace { @@ -760,6 +776,7 @@ StatusOr LayoutAssignment::CreateCopyWithNewLayout( HloInstruction* copy = instruction->parent()->AddInstruction(HloInstruction::CreateUnary( instruction->shape(), HloOpcode::kCopy, instruction)); + RegisterAddedCopy(copy); SetupCopiedInstruction(*instruction, copy, {}); LayoutUtil::ClearLayout(copy->mutable_shape()); TF_RETURN_IF_ERROR(LayoutUtil::CopyLayoutBetweenShapes( @@ -783,13 +800,19 @@ Status LayoutAssignment::CopyOperandIfLayoutsDiffer( TF_RET_CHECK(LayoutUtil::HasLayout(operand->shape())); if (ShapeUtil::Equal(operand_layout.shape(), operand->shape())) { + VLOG(5) << "Operand " << operand->ToString() << " layout matches in " + << instruction->ToString(); // Operand layout already matches our constraint. Nothing to do. return Status::OK(); } + VLOG(4) << "Operand " << operand->ToString() << " layout does not match " + << operand_layout.ToString() << " in " << instruction->ToString(); TF_ASSIGN_OR_RETURN(HloInstruction * operand_copy, CreateCopyWithNewLayout(operand_layout.shape(), operand)); + VLOG(4) << "New copy of " << operand->ToString() << " is " + << operand_copy->ToString(); return instruction->ReplaceOperandWith(operand_no, operand_copy); } @@ -896,15 +919,16 @@ Status LayoutAssignment::CheckLayouts(HloModule* module) { } } } - - // Finally verify the result layout matches the layout of the entry + // Finally verify the result layout, if set, matches the layout of the entry // computation root. - TF_RET_CHECK(ShapeUtil::Equal( - module->entry_computation()->root_instruction()->shape(), + const ShapeLayout& result_layout = FindOrDie(computation_layouts_, module->entry_computation()) - .result_layout() - .shape())); - + .result_layout(); + if (result_layout.LayoutIsSet()) { + TF_RET_CHECK(ShapeUtil::Equal( + module->entry_computation()->root_instruction()->shape(), + result_layout.shape())); + } return Status::OK(); } @@ -913,18 +937,13 @@ LayoutAssignment::LayoutAssignment( ChannelLayoutConstraints* channel_constraints) : entry_computation_layout_(entry_computation_layout), channel_layout_constraints_(channel_constraints) { - VLOG(1) << "entry computation layout given to layout assignment: " + VLOG(1) << "Entry computation layout given to layout assignment: " << entry_computation_layout_->ToString(); // Layouts of all parameter instructions must be set. 
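As an illustrative aside (not part of the patch): together with the constructor hunk just below, which drops the old default-result-layout fallback, the relaxed checks above mean an entry result layout may now stay unset and be chosen by the pass itself. A rough usage sketch under stated assumptions: the helper name is hypothetical, the module's parameter shapes are assumed to already carry layouts (the constructor still CHECKs those), and `ComputationLayout(program_shape, /*ignore_layouts=*/false)` is assumed to keep only the layouts present in the given `ProgramShape`, as the `CalculateComputationLayout()` hunk later in this file suggests.

```c++
#include "tensorflow/compiler/xla/layout_util.h"
#include "tensorflow/compiler/xla/service/computation_layout.h"
#include "tensorflow/compiler/xla/service/hlo_computation.h"
#include "tensorflow/compiler/xla/service/hlo_module.h"
#include "tensorflow/compiler/xla/service/layout_assignment.h"
#include "tensorflow/compiler/xla/xla_data.pb.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/logging.h"

// Hypothetical helper: run layout assignment with the entry result layout left
// open, letting PropagateComputationLayouts() fill it in from the root.
void AssignLayoutsWithOpenResultLayout(xla::HloModule* module) {
  xla::ProgramShape program_shape =
      module->entry_computation()->ComputeProgramShape();
  // Drop the result layout so the pass is free to choose it.
  xla::LayoutUtil::ClearLayout(program_shape.mutable_result());
  xla::ComputationLayout entry_layout(program_shape, /*ignore_layouts=*/false);

  xla::LayoutAssignment pass(&entry_layout);
  TF_CHECK_OK(pass.Run(module).status());

  // After Run(), the result layout has been propagated back from the root.
  CHECK(entry_layout.result_layout().LayoutIsSet());
}
```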
for (const ShapeLayout& parameter_layout : entry_computation_layout_->parameter_layouts()) { CHECK(parameter_layout.LayoutIsSet()); } - // If the result layout is not set, then choose the default. - // TODO(b/29118294): Choose a better layout in this case. - if (!entry_computation_layout_->result_layout().LayoutIsSet()) { - entry_computation_layout_->mutable_result_layout()->SetToDefaultLayout(); - } } std::unique_ptr LayoutAssignment::ChooseOperandLayoutFromOutputLayout( @@ -1484,16 +1503,60 @@ Status LayoutAssignment::AssignLayouts(const LayoutConstraints& constraints, return Status::OK(); } +Status LayoutAssignment::CalculateComputationLayout( + HloComputation* computation) { + ComputationLayout computation_layout(computation->ComputeProgramShape(), + /*ignore_layouts=*/false); + InsertOrDie(&computation_layouts_, computation, computation_layout); + VLOG(2) << " Calculated ComputationLayout = " + << computation_layout.ToString(); + return Status::OK(); +} + +Status LayoutAssignment::ClearComputationLayouts(HloComputation* computation) { + // Clear existing layouts of the instructions. All layouts must be assigned + // by the LayoutAssignment pass, except for those on infeeds, parameters, + // and the computation result. The latter two are specified in + // computation_layout, so we only need to keep the existing layouts for + // infeeds. Clearing the layouts here avoids hiding potential bugs in the + // layout assignment pass that may accidently use the existing layout. + for (HloInstruction* instruction : computation->instructions()) { + if (instruction->opcode() == HloOpcode::kBitcast) { + // bitcasts are inherently layout sensitive and so a bitcast instruction + // present in the IR before layout assignment is a bug. + return InternalError( + "Unexpected bitcast operation seen during layout assignment: %s.", + instruction->ToString().c_str()); + } + if (instruction->opcode() != HloOpcode::kInfeed) { + LayoutUtil::ClearLayout(instruction->mutable_shape()); + } + } + return Status::OK(); +} + Status LayoutAssignment::RunOnComputation( - const ComputationLayout& computation_layout, + ComputationLayout* computation_layout, const TuplePointsToAnalysis& points_to_analysis, HloComputation* computation, ChannelLayoutConstraints* channel_constraints) { - DCHECK(computation_layout.LayoutIsSet()); - InsertOrDie(&computation_layouts_, computation, computation_layout); VLOG(2) << "LayoutAssignment::RunOnComputation(" << computation->name() << ")"; - VLOG(2) << " ComputationLayout = " << computation_layout.ToString(); + TF_RETURN_IF_ERROR(ClearComputationLayouts(computation)); + if (computation_layout != nullptr) { + auto it = computation_layouts_.find(computation); + if (it == computation_layouts_.end()) { + VLOG(2) << " New ComputationLayout = " << computation_layout->ToString(); + computation_layouts_.emplace(computation, *computation_layout); + } else { + TF_RET_CHECK(computation_layout == &it->second || + computation_layout == entry_computation_layout_); + VLOG(2) << " Existing ComputationLayout = " + << computation_layout->ToString(); + } + } else { + VLOG(2) << " No ComputationLayout specified (will be calculated)"; + } // Construct LayoutConstraints with all layout constraints of the computation. LayoutConstraints constraints(points_to_analysis, computation); @@ -1536,12 +1599,19 @@ Status LayoutAssignment::RunOnComputation( CHECK_LT(constraints.unconstrained_buffer_ids().size(), unconstrained_count); } - // All logical buffers should have constraints at this point. 
All that // remains is assign the constraints to the buffers and infer layouts for // aliased buffers. TF_RETURN_IF_ERROR(AssignLayouts(constraints, computation)); + // If the computation layout wasn't specified, now it is the time to compute + // it according to the parameters and root instruction layouts. + // This allows the first pass through this API to record the best flowing + // layout to parameters and root instruction. + if (computation_layout == nullptr) { + TF_RETURN_IF_ERROR(CalculateComputationLayout(computation)); + } + // Record the layouts assigned for any communication ops in // channel_constraints so that they are constrained for future modules. for (HloInstruction* instruction : computation->instructions()) { @@ -1556,6 +1626,34 @@ Status LayoutAssignment::RunOnComputation( return Status::OK(); } +Status LayoutAssignment::PropagateComputationLayouts( + HloComputation* computation, ComputationLayout* computation_layout) { + ComputationLayout computed_computation_layout( + computation->ComputeProgramShape(), + /*ignore_layouts=*/false); + for (int64 i = 0; i < computed_computation_layout.parameter_count(); ++i) { + ShapeLayout* param_layout = computation_layout->mutable_parameter_layout(i); + if (!param_layout->LayoutIsSet()) { + VLOG(4) << "Assigning layout to parameter " << i << " of computation " + << computation->name() << ": " + << computed_computation_layout.parameter_layout(i).ToString(); + *param_layout = computed_computation_layout.parameter_layout(i); + } else { + TF_RET_CHECK(computed_computation_layout.parameter_layout(i) == + *param_layout); + } + } + ShapeLayout* result_layout = computation_layout->mutable_result_layout(); + if (!result_layout->LayoutIsSet()) { + VLOG(4) << "Assigning result layout of computation " << computation->name() + << ": " << computed_computation_layout.result_layout().ToString(); + *result_layout = computed_computation_layout.result_layout(); + } else { + TF_RET_CHECK(computed_computation_layout.result_layout() == *result_layout); + } + return Status::OK(); +} + StatusOr LayoutAssignment::Run(HloModule* module) { VLOG(2) << "Running layout assignment on module " << module->name(); XLA_VLOG_LINES(3, module->ToString()); @@ -1564,52 +1662,45 @@ StatusOr LayoutAssignment::Run(HloModule* module) { "before layout assignment", module->config().debug_options()); } - - TF_ASSIGN_OR_RETURN(auto points_to_analysis, - TuplePointsToAnalysis::Run(module)); - - // Assign layouts to computations in an order such that a callee computation - // is handled before its caller computation. This ensures that the layout of - // all callers of a computation will agree. - std::list computation_post_order = - module->MakeComputationPostOrder(); - for (auto* computation : module->MakeComputationPostOrder()) { - if (computation->IsFusionComputation()) { - continue; - } - // Clear existing layouts of the instructions. All layouts must be assigned - // by the LayoutAssignment pass, except for those on infeeds, parameters, - // and the computation result. The latter two are specified in - // computation_layout, so we only need to keep the existing layouts for - // infeeds. Clearing the layouts here avoids hiding potential bugs in the - // layout assignment pass that may accidently use the existing layout. - for (HloInstruction* instruction : computation->instructions()) { - if (instruction->opcode() == HloOpcode::kBitcast) { - // bitcasts are inherently layout sensitive and so a bitcast instruction - // present in the IR before layout assignment is a bug. 
- return InternalError( - "Unexpected bitcast operation seen during layout assignment: %s.", - instruction->ToString().c_str()); + TF_RETURN_IF_ERROR(Init()); + + // We do two passes. In the first one we pass a nullptr ComputationLayout to + // the RunOnComputation() calls (for non-entry computations), and we register + // the ComputationLayouts which naturally flow in DFS fashion to the + // parameters and root instruction. + // Walking in DFS mode, though, means that we can end up with incorrect layouts + // when seen from an outer instruction, which has across-computation + // constraints to impose. + // For example, the kWhile instruction needs to enforce the same layouts for + // the parameters and root of the body, as well as the condition parameters. + // Similarly, the kConditional instruction needs to enforce the same layouts + // for the root of the true and false computations. + // So in the first pass, while allowing the layouts to flow to parameters and + // root, we also fix up any inconsistent ComputationLayouts, which will then + // be made mandatory by the second pass. + for (int64 i = 0; i < 2; ++i) { + TF_RETURN_IF_ERROR(ClearPreviousPassSideEffects(module)); + TF_ASSIGN_OR_RETURN(auto points_to_analysis, + TuplePointsToAnalysis::Run(module)); + for (auto* computation : module->MakeComputationPostOrder()) { + if (computation->IsFusionComputation()) { + continue; } - if (instruction->opcode() != HloOpcode::kInfeed) { - LayoutUtil::ClearLayout(instruction->mutable_shape()); + if (computation == module->entry_computation()) { + TF_RETURN_IF_ERROR(RunOnComputation( + entry_computation_layout_, *points_to_analysis, + module->entry_computation(), channel_layout_constraints_)); + } else { + ComputationLayout* computation_layout = + (i == 0) ? nullptr : &FindOrDie(computation_layouts_, computation); + TF_RETURN_IF_ERROR(RunOnComputation(computation_layout, + *points_to_analysis, computation, + channel_layout_constraints_)); } } - if (computation == module->entry_computation()) { - TF_RETURN_IF_ERROR(RunOnComputation( - *entry_computation_layout_, *points_to_analysis, - module->entry_computation(), channel_layout_constraints_)); - } else { - ComputationLayout computation_layout(computation->ComputeProgramShape()); - // Setting all embedded computations to the default layout is potentially - // suboptimal. - computation_layout.SetToDefaultLayout(); - TF_RETURN_IF_ERROR(RunOnComputation(computation_layout, - *points_to_analysis, computation, - channel_layout_constraints_)); - } } - + TF_RETURN_IF_ERROR(PropagateComputationLayouts(module->entry_computation(), + entry_computation_layout_)); TF_RETURN_IF_ERROR(CheckLayouts(module)); VLOG(3) << "After layout assignment:"; @@ -1619,9 +1710,54 @@ StatusOr LayoutAssignment::Run(HloModule* module) { "after layout assignment", module->config().debug_options()); } - // All layouts are reset then reassigned by this pass. return true; } +Status LayoutAssignment::Init() { + computation_layouts_.clear(); + return Status::OK(); +} + +Status LayoutAssignment::ClearPreviousPassSideEffects(HloModule* module) { + // Clear all the copies which have been added, and all the related + // instructions (like GTE and tuples).
+ int64 removed_copies = 0; + for (HloComputation* computation : module->computations()) { + for (HloInstruction* instruction : + computation->MakeInstructionPostOrder()) { + if (instruction->opcode() == HloOpcode::kCopy && + added_copies_.count(instruction) > 0) { + VLOG(5) << "Removing added copy: " << instruction->ToString(); + TF_RETURN_IF_ERROR( + instruction->ReplaceAllUsesWith(instruction->mutable_operand(0))); + TF_RETURN_IF_ERROR(computation->RemoveInstruction(instruction)); + ++removed_copies; + } + } + } + added_copies_.clear(); + if (removed_copies > 0) { + TupleSimplifier tuple_simplifier; + HloDCE dce; + TF_RETURN_IF_ERROR(tuple_simplifier.Run(module).status()); + TF_RETURN_IF_ERROR(dce.Run(module).status()); + } + return Status::OK(); +} + +Status LayoutAssignment::AddCopyForOperand(HloInstruction* instruction, + int64 operand_number) { + HloInstruction* operand = instruction->mutable_operand(operand_number); + if (operand->opcode() != HloOpcode::kCopy || operand->user_count() > 1) { + HloInstruction* copy = + instruction->parent()->AddInstruction(HloInstruction::CreateUnary( + operand->shape(), HloOpcode::kCopy, operand)); + SetupCopiedInstruction(*operand, copy, {}); + LayoutUtil::ClearLayout(copy->mutable_shape()); + TF_RETURN_IF_ERROR(instruction->ReplaceOperandWith(operand_number, copy)); + } + return Status::OK(); +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/layout_assignment.h b/tensorflow/compiler/xla/service/layout_assignment.h index ae4986d6ad9bc3..c287cca0c54ba1 100644 --- a/tensorflow/compiler/xla/service/layout_assignment.h +++ b/tensorflow/compiler/xla/service/layout_assignment.h @@ -39,6 +39,7 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/platform/types.h" namespace xla { @@ -281,8 +282,8 @@ class LayoutAssignment : public HloPassInterface { // the case that no particular layout is requested. // // channel_constraints is both an input and output. Any sends or recvs that - // are present in channel_constraints will be layed out as constrained. Any - // unconstrained sends or recvs will be layed out as locally optimal and their + // are present in channel_constraints will be laid out as constrained. Any + // unconstrained sends or recvs will be laid out as locally optimal and their // layout will be added as a constraint to channel_constraints. // // If channel_constraints is nullptr, no kSend or kRecvs must be contained @@ -362,12 +363,15 @@ class LayoutAssignment : public HloPassInterface { int64 operand_no); private: + // Initializes the layout assignment object for a new Run() call. + Status Init(); + // Adds constraints which must be satisfied for correctness on all // backends. Called once prior to propagating constraints. - Status AddMandatoryConstraints( - const ComputationLayout& computation_layout, - const ChannelLayoutConstraints* channel_constraints, - HloComputation* computation, LayoutConstraints* constraints); + Status AddMandatoryConstraints(const ComputationLayout* computation_layout, + ChannelLayoutConstraints* channel_constraints, + HloComputation* computation, + LayoutConstraints* constraints); // This method can be overridden to add backend-specific constraints to the // layout of the instructions of a computation. 
This method is called after @@ -378,10 +382,12 @@ class LayoutAssignment : public HloPassInterface { } // Construct contraints and assign layouts to all instructions in the - // computation satisfying the given ComputationLayout. Layouts constraints are - // added, then propagated until all LogicalBuffers in the computation are - // constrained. - Status RunOnComputation(const ComputationLayout& computation_layout, + // computation satisfying the given ComputationLayout, if not nullptr. + // Otherwise the ComputationLayout will be calculated by propagating the + // computation instruction constraints. + // Layout constraints are added, then propagated until all LogicalBuffers in + // the computation are constrained. + Status RunOnComputation(ComputationLayout* computation_layout, const TuplePointsToAnalysis& points_to_analysis, HloComputation* computation, ChannelLayoutConstraints* channel_constraints); @@ -402,6 +408,25 @@ class LayoutAssignment : public HloPassInterface { // necessary conditions. Status CheckLayouts(HloModule* module); + // Computes the ComputationLayout of the given computation based on the + // layouts assigned to parameters and root instruction, and inserts it into + // the computation_layouts_ map. + Status CalculateComputationLayout(HloComputation* computation); + + // Clears all the layouts which can be cleared within a computation. + Status ClearComputationLayouts(HloComputation* computation); + + // Clears the side effects of a previous pass, like added copy instructions. + Status ClearPreviousPassSideEffects(HloModule* module); + + // Propagates the layouts computed by the layout assignment pass on the given + // computation to the computation layout passed in to this API. + // This API propagates missing layouts, and also checks that the layouts the + // caller specified have been respected, by comparing them with the layouts of + // the parameters and the root instruction. + Status PropagateComputationLayouts(HloComputation* computation, + ComputationLayout* computation_layout); + ComputationLayout* entry_computation_layout_; protected: @@ -418,21 +443,37 @@ class LayoutAssignment : public HloPassInterface { // Creates and returns a copy of the given instruction with a different // layout. Tuple-shaped instructions will be deep-copied, and the last Tuple // instruction producing the copy is returned. - static StatusOr CreateCopyWithNewLayout( + StatusOr CreateCopyWithNewLayout( const Shape& shape_with_layout, HloInstruction* instruction); // Creates a copy of the given operand if the operand's layout does not match // the given layout. This copy replaces the use in the given instruction. // Tuple operands will be deep-copied. - static Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout, - HloInstruction* instruction, - int64 operand_no); + Status CopyOperandIfLayoutsDiffer(const ShapeLayout& operand_layout, + HloInstruction* instruction, + int64 operand_no); + + // Registers a copy instruction added by the layout assignment pass. + void RegisterAddedCopy(HloInstruction* copy) { + CHECK_EQ(copy->opcode(), HloOpcode::kCopy); + added_copies_.insert(copy); + } + + // Adds a copy for the operand of an instruction, unless such operand is + // already a copy, and has a single user (which is necessarily the instruction + // itself). + Status AddCopyForOperand(HloInstruction* instruction, int64 operand_number); // Map containing the layouts of all computations assigned so // far.
Computations are handled in a topological sort where computations are // handled before their caller instructions so the layouts of caller // instructions can be set to match the computation. std::map computation_layouts_; + + // Every copy added to the module by the layout assignment pass is registered + // here. + tensorflow::gtl::FlatSet added_copies_; + ChannelLayoutConstraints* channel_layout_constraints_; }; diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc index 4b1c9bad41de80..bf0448a67674f2 100644 --- a/tensorflow/compiler/xla/service/layout_assignment_test.cc +++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc @@ -29,13 +29,13 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/status.h" @@ -651,7 +651,7 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) { } )"; - auto module = tools::Parse(module_str).ValueOrDie(); + auto module = ParseHloString(module_str).ValueOrDie(); module = backend() @@ -660,13 +660,12 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) { /*device_allocator=*/nullptr) .ConsumeValueOrDie(); - EXPECT_EQ( - ::tensorflow::Status::OK(), - backend() - .compiler() - ->RunBackend(std::move(module), backend().default_stream_executor(), - /*device_allocator=*/nullptr) - .status()); + EXPECT_EQ(Status::OK(), backend() + .compiler() + ->RunBackend(std::move(module), + backend().default_stream_executor(), + /*device_allocator=*/nullptr) + .status()); } // A GTE inside of a fusion node inherits the layout of its operand (which @@ -692,7 +691,7 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) { } )"; - auto module = tools::Parse(module_str).ValueOrDie(); + auto module = ParseHloString(module_str).ValueOrDie(); ComputationLayout computation_layout( module->entry_computation()->ComputeProgramShape()); Shape param_shape = ShapeUtil::MakeTupleShape( diff --git a/tensorflow/compiler/xla/service/liveness_util.cc b/tensorflow/compiler/xla/service/liveness_util.cc deleted file mode 100644 index 68c99256a246ed..00000000000000 --- a/tensorflow/compiler/xla/service/liveness_util.cc +++ /dev/null @@ -1,379 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include "tensorflow/compiler/xla/service/liveness_util.h" - -#include -#include -#include - -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/logical_buffer.h" -#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" - -namespace xla { - -bool DoesNotUseOperandBuffer(const HloInstruction* operand, - const ShapeIndex& index, - const HloInstruction* user, - const TuplePointsToAnalysis& points_to_analysis) { - CHECK(user->IsUserOf(operand)) - << "user: " << user->ToString() << " operand: " << operand->ToString(); - if (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()) { - // GetTupleElement instructions only access the top-level buffer of their - // operand. - return true; - } else if (user->opcode() == HloOpcode::kFusion && - user->fusion_kind() == HloInstruction::FusionKind::kLoop) { - // Find fusion parameter associated with 'operand'. - auto it = std::find_if( - user->fused_parameters().begin(), user->fused_parameters().end(), - [=](HloInstruction* fused_param) { - return user->operand(fused_param->parameter_number()) == operand; - }); - CHECK(it != user->fused_parameters().end()); - // Iterate through all users of all buffer aliases of the buffer in the - // points-to set of fusion parameter at 'index'. - // Return false if any uses are detected at 'index', returns true otherwise. - const LogicalBuffer* buffer = - points_to_analysis.GetBufferDefinedAt(*it, index).ValueOrDie(); - for (const BufferAlias& alias : - points_to_analysis.GetBufferAliases(*buffer)) { - for (HloInstruction* alias_user : alias.instruction()->users()) { - if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), - alias_user, points_to_analysis)) { - continue; - } - // Return false: use detected at 'buffer' -> 'alias' -> 'alias_user'. - return false; - } - } - // Return true: found no uses of 'operand' at 'index' in 'user'. - return true; - } - return false; -} - -bool DoesNotUseOperandBuffer(const HloInstruction* operand, - const ShapeIndex& index, - const HloInstruction* user, - const HloDataflowAnalysis& dataflow) { - CHECK(user->IsUserOf(operand)) - << "user: " << user->ToString() << " operand: " << operand->ToString(); - if (user->opcode() == HloOpcode::kFusion && - user->fusion_kind() == HloInstruction::FusionKind::kLoop) { - // Find fusion parameter associated with 'operand'. - HloInstruction* fusion_param = - user->fused_parameter(user->operand_index(operand)); - // Iterate through all users of all uses of the fusion parameter value. - // Return false if any uses are detected, returns true otherwise. - const HloValue& value = dataflow.GetValueDefinedAt(fusion_param, index); - return value.uses().empty(); - } else { - // Return false if no value at 'operand' and 'index' is used at 'user'. - for (const HloValue* value : - dataflow.GetValueSet(operand, index).values()) { - for (const HloUse& use : value->uses()) { - if (use.instruction == user) { - return false; - } - } - } - } - - return true; -} - -namespace { - -// Returns all uses of all aliases of 'instruction' at 'index' in 'uses'. 
-// Each use in 'uses' is a pair (HloInstruction* user, int64 operand_index) -// where 'user' is a user of an alias of 'instruction' at 'index', and -// 'operand_index' is the operand index at which the alias appears in the -// operand list of 'user'. -std::vector> GetAllUsesOfInstructionAtIndex( - HloInstruction* instruction, const ShapeIndex& index, - const TuplePointsToAnalysis& points_to_analysis) { - std::vector> uses; - const PointsToSet::BufferList& points_to = - points_to_analysis.GetPointsToSet(instruction).element(index); - for (const LogicalBuffer* buffer : points_to) { - for (const BufferAlias& alias : - points_to_analysis.GetBufferAliases(*buffer)) { - for (HloInstruction* alias_user : alias.instruction()->users()) { - if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), - alias_user, points_to_analysis)) { - continue; - } - for (int64 op_idx : alias_user->OperandIndices(alias.instruction())) { - uses.emplace_back(alias_user, op_idx); - } - } - } - } - return uses; -} - -// Returns true if there is exactly one use of 'operand' at 'operand_index' -// in 'fusion.fused_instructions', where the singleton use is the fused -// root at operand index 'use_operand_index'. Returns false otherwise. -// -// REQUIRES: 'fusion' opcode is a kFusion instruction. -bool HasUniqueFusedUseOfOperandAt( - HloInstruction* operand, const ShapeIndex& operand_index, - HloInstruction* fusion, const int64 use_operand_index, - const TuplePointsToAnalysis& points_to_analysis) { - CHECK_EQ(HloOpcode::kFusion, fusion->opcode()); - // Check that 'operand' is unique in the operand list of 'fusion'. - if (fusion->OperandIndices(operand).size() > 1) { - return false; - } - // Find fusion parameter associated with 'operand'. - const auto& fused_params = fusion->fused_parameters(); - auto fused_param_it = std::find_if( - fused_params.begin(), fused_params.end(), - [&](HloInstruction* fused_param) { - return fusion->operand(fused_param->parameter_number()) == operand; - }); - if (fused_param_it == fused_params.end()) { - return false; - } - auto* fused_param = *fused_param_it; - // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'. - auto fused_param_uses = GetAllUsesOfInstructionAtIndex( - fused_param, operand_index, points_to_analysis); - // Return true iff there is exactly one use of 'operand' at 'index', and - // this singleton use is the fused root (at index in 'use_operand_indices'). - return fused_param_uses.size() == 1 && - fused_param_uses[0].first == fusion->fused_expression_root() && - fused_param_uses[0].second == use_operand_index; -} - -} // namespace - -// User and operand can share buffers iff both instructions emit the same shape -// and layout, and 'user' meets one of the following qualifications: -// -// (1) Is element-wise. Or... -// (2) Is a loop fusion instruction where the only use of 'operand' at 'index' -// in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root -// at operand 0. Or... -// (3) Is a kDot -> kAdd (or fused kTransposeDot -> kAdd) output fusion -// instruction where the only use of 'operand' at 'index' in the set -// 'user.fused_instructions' is a kAdd fused root at operand 0 or 1. Or... -// (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index -// 0. -// -// (2) and (3) can only be determined if points-to analysis is available. 
-bool CanShareOperandBufferWithUser( - HloInstruction* operand, const ShapeIndex& operand_index, - HloInstruction* user, const ShapeIndex& user_index, - const TuplePointsToAnalysis& points_to_analysis) { - CHECK(user->IsUserOf(operand)) - << "user: " << user->ToString() << " operand: " << operand->ToString(); - const Shape& operand_subshape = - ShapeUtil::GetSubshape(operand->shape(), operand_index); - const Shape& user_subshape = - ShapeUtil::GetSubshape(user->shape(), user_index); - // Check that operand and user emit the same shape and layout. - if (!ShapeUtil::Equal(operand_subshape, user_subshape)) { - return false; - } - if (user->opcode() == HloOpcode::kFusion) { - if (user->fusion_kind() == HloInstruction::FusionKind::kLoop && - user->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice) { - // Loop fusion with kDynamicUpdateSlice fused root. - // - // Returns true iff there is exactly one use of 'operand' at shape index - // 'operand_index', and this singleton use is the fused root at operand - // index 0. - return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0, - points_to_analysis); - } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && - user->fused_expression_root()->opcode() == HloOpcode::kAdd) { - // Output fusion with kAdd fused root. - - // Check if one operand of kAdd fused root is either kDot, or nested - // kFusion of kind kTransposeDot. - auto* add = user->fused_expression_root(); - auto add_operand_it = - std::find_if(add->operands().begin(), add->operands().end(), - [&](HloInstruction* operand) { - return operand->opcode() == HloOpcode::kConvolution || - operand->opcode() == HloOpcode::kDot || - (operand->opcode() == HloOpcode::kFusion && - operand->fusion_kind() == - HloInstruction::FusionKind::kTransposeDot); - }); - if (add_operand_it == add->operands().end()) { - return false; - } - auto* matched_add_operand = *add_operand_it; - // Calculate operand index of 'add' operand which was not matched above. - const int64 other_add_operand_index = - matched_add_operand == add->operand(0) ? 1 : 0; - // Returns true iff there is exactly one use of 'operand' at shape index - // 'operand_index', and this singleton use is the fused root (at operand - // index 'other_add_operand_index'). - return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, - other_add_operand_index, - points_to_analysis); - } - } - if (user->opcode() == HloOpcode::kDynamicUpdateSlice || - user->opcode() == HloOpcode::kWhile) { - // We eliminated other users in BufferLiveness::live_range_strictly_before, - // so here we just need to check that the use is at operand index 0. - std::vector operand_indices = user->OperandIndices(operand); - return operand_indices.size() == 1 && operand_indices[0] == 0; - } - if (user->opcode() == HloOpcode::kCall) { - // TODO(b/62548313): Remove when buffer assignment is module scoped and - // does not assign buffers to calls. - // Find called computation parameter associated with 'operand'. - const std::vector operand_indices = user->OperandIndices(operand); - if (operand_indices.size() > 1) { - return false; - } - CHECK_EQ(1, operand_indices.size()); - auto* param = user->to_apply()->parameter_instruction(operand_indices[0]); - // Get all uses of 'operand' at 'index' in called computation. - auto param_uses = GetAllUsesOfInstructionAtIndex(param, operand_index, - points_to_analysis); - - // Return true iff: - // *) There exists exactly one use of 'operand' in called computation. 
- // *) The unique use is by the root instruction of called computation. - // (Note: we check the root of the called computation, because the - // root result buffer is required to alias with the Call result buffer). - // *) The root instruction of the called computation is element-wise on - // 'operand'. - auto* callee_root = user->to_apply()->root_instruction(); - return param_uses.size() == 1 && param_uses[0].first == callee_root && - callee_root->IsElementwiseOnOperand(param_uses[0].second); - } - // Check if 'user' is element-wise. - return user->IsElementwise(); -} - -bool CanShareOperandBufferWithUser(HloInstruction* operand, - const ShapeIndex& operand_index, - HloInstruction* user, - const ShapeIndex& user_index, - const HloDataflowAnalysis& dataflow) { - CHECK(user->IsUserOf(operand)) - << "user: " << user->ToString() << " operand: " << operand->ToString(); - const Shape& operand_subshape = - ShapeUtil::GetSubshape(operand->shape(), operand_index); - const Shape& user_subshape = - ShapeUtil::GetSubshape(user->shape(), user_index); - // Check that operand and user emit the same shape and layout. - if (!ShapeUtil::Equal(operand_subshape, user_subshape)) { - return false; - } - - if (user->opcode() == HloOpcode::kFusion) { - // Get the parameter associated with 'operand'; - HloInstruction* fusion_param = - user->fused_parameter(user->operand_index(operand)); - - const HloValue& value = - dataflow.GetValueDefinedAt(fusion_param, operand_index); - if (value.uses().size() != 1) { - return false; - } - const HloUse& use = value.uses()[0]; - - if (user->fusion_kind() == HloInstruction::FusionKind::kLoop && - user->fused_expression_root()->opcode() == - HloOpcode::kDynamicUpdateSlice) { - // Loop fusion with kDynamicUpdateSlice fused root. - // - // Returns true iff there is exactly one use of 'operand' at shape index - // 'operand_index', and this singleton use is the fused root at operand - // index 0. - return use.instruction == user->fused_expression_root() && - use.operand_number == 0; - } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput && - user->fused_expression_root()->opcode() == HloOpcode::kAdd) { - // Output fusion with kAdd fused root. - - // Check if one operand of kAdd fused root is either kDot, or nested - // kFusion of kind kTransposeDot. - auto* add = user->fused_expression_root(); - auto add_operand_it = - std::find_if(add->operands().begin(), add->operands().end(), - [&](HloInstruction* operand) { - return operand->opcode() == HloOpcode::kConvolution || - operand->opcode() == HloOpcode::kDot || - (operand->opcode() == HloOpcode::kFusion && - operand->fusion_kind() == - HloInstruction::FusionKind::kTransposeDot); - }); - if (add_operand_it == add->operands().end()) { - return false; - } - auto* matched_add_operand = *add_operand_it; - // Calculate operand index of 'add' operand which was not matched above. - const int64 other_add_operand_index = - matched_add_operand == add->operand(0) ? 1 : 0; - // Returns true iff there is exactly one use of 'operand' at shape index - // 'operand_index', and this singleton use is the fused root (at operand - // index 'other_add_operand_index'). - return use.instruction == user->fused_expression_root() && - use.operand_number == other_add_operand_index; - } - } - if (user->opcode() == HloOpcode::kDynamicUpdateSlice || - user->opcode() == HloOpcode::kWhile) { - // We eliminated other users in BufferLiveness::live_range_strictly_before, - // so here we just need to check that the use is at operand index 0. 
- std::vector operand_indices = user->OperandIndices(operand); - return operand_indices.size() == 1 && operand_indices[0] == 0; - } - if (user->opcode() == HloOpcode::kCall) { - // Get all uses of value defined by 'operand' at 'operand_index'. - const auto& uses = - dataflow.GetValueDefinedAt(operand, operand_index).uses(); - // Return true iff: - // *) There exists two uses of 'operand'. - // *) One use is by 'user' (caller). - // *) One use is by root instruction of called computation (callee root). - // (Note: we check the root of the called computation, because the - // root result buffer is required to alias with the Call result buffer). - // *) The root instruction of the called computation is element-wise on - // 'operand'. - const bool found_caller_use = - std::find_if(uses.begin(), uses.end(), [user](const HloUse& use) { - return use.instruction == user; - }) != uses.end(); - auto* callee_root = user->to_apply()->root_instruction(); - const bool found_elementwise_callee_use = - std::find_if( - uses.begin(), uses.end(), [callee_root](const HloUse& use) { - return use.instruction == callee_root && - callee_root->IsElementwiseOnOperand(use.operand_number); - }) != uses.end(); - return uses.size() == 2 && found_caller_use && found_elementwise_callee_use; - } - // Check if 'user' is element-wise. - return user->IsElementwise(); -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/liveness_util.h b/tensorflow/compiler/xla/service/liveness_util.h deleted file mode 100644 index 28ef991880039d..00000000000000 --- a/tensorflow/compiler/xla/service/liveness_util.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// A collection of utilities on the HLO graph. - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_ - -#include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/types.h" - -namespace xla { - -// Returns true if 'user' cannot possibly use the buffer at 'index' in -// 'operand'. Returns false otherwise. -// -// REQUIRES: 'operand' is an operand of 'user'. -// -// TODO(b/65835246): Remove TuplePointsToAnalysis overload when all users have -// moved over to the dataflow overload. -bool DoesNotUseOperandBuffer(const HloInstruction* operand, - const ShapeIndex& index, - const HloInstruction* user, - const TuplePointsToAnalysis& points_to_analysis); -bool DoesNotUseOperandBuffer(const HloInstruction* operand, - const ShapeIndex& index, - const HloInstruction* user, - const HloDataflowAnalysis& dataflow); - -// Returns true if 'user' (at 'user_index') can share a buffer with its operand -// 'operand' (at 'operand_index'). 
Returns false otherwise. -// -// REQUIRES: 'operand' is an operand of 'user'. -// -// TODO(b/65835246): Remove TuplePointsToAnalysis overload when all users have -// moved over to the dataflow overload. -bool CanShareOperandBufferWithUser( - HloInstruction* operand, const ShapeIndex& operand_index, - HloInstruction* user, const ShapeIndex& user_index, - const TuplePointsToAnalysis& points_to_analysis); -bool CanShareOperandBufferWithUser(HloInstruction* operand, - const ShapeIndex& operand_index, - HloInstruction* user, - const ShapeIndex& user_index, - const HloDataflowAnalysis& dataflow); - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_LIVENESS_UTIL_H_ diff --git a/tensorflow/compiler/xla/service/liveness_util_test.cc b/tensorflow/compiler/xla/service/liveness_util_test.cc deleted file mode 100644 index f8b309488eeb53..00000000000000 --- a/tensorflow/compiler/xla/service/liveness_util_test.cc +++ /dev/null @@ -1,463 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/liveness_util.h" - -#include - -#include "tensorflow/compiler/xla/service/hlo_module.h" -#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h" -#include "tensorflow/compiler/xla/tests/hlo_test_base.h" - -namespace xla { -namespace { - -class PointsToAnalysisTestBase : public HloTestBase { - protected: - void BuildModule(std::unique_ptr computation) { - module_ = CreateNewModule(); - computation_ = module_->AddEntryComputation(std::move(computation)); - } - - void RunAnalysis() { - CHECK_NOTNULL(module_.get()); - points_to_analysis_ = - TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); - dataflow_analysis_ = HloDataflowAnalysis::Run(*module_).ConsumeValueOrDie(); - } - - void BuildModuleAndRunAnalysis(std::unique_ptr computation) { - BuildModule(std::move(computation)); - RunAnalysis(); - } - - std::unique_ptr module_; - HloComputation* computation_ = nullptr; - std::unique_ptr points_to_analysis_; - std::unique_ptr dataflow_analysis_; -}; - -class DoesNotUseOperandBufferTest : public PointsToAnalysisTestBase {}; - -TEST_F(DoesNotUseOperandBufferTest, GetTupleElement) { - auto builder = HloComputation::Builder(TestName()); - - Shape elem_shape = ShapeUtil::MakeShape(F32, {8}); - auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeTupleShape({elem_shape, elem_shape}), "tuple")); - auto gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(elem_shape, tuple, 0)); - auto gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(elem_shape, tuple, 1)); - builder.AddInstruction( - HloInstruction::CreateBinary(elem_shape, HloOpcode::kAdd, gte0, gte1)); - - BuildModuleAndRunAnalysis(builder.Build()); - - // GetTupleElement instructions only access the top-level buffer of their - // operand. 
- EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {0}, gte0, *points_to_analysis_)); - EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {1}, gte1, *points_to_analysis_)); - EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte0, *points_to_analysis_)); - EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte1, *points_to_analysis_)); - - EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {0}, gte0, *dataflow_analysis_)); - EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {1}, gte1, *dataflow_analysis_)); - EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte0, *dataflow_analysis_)); - EXPECT_FALSE(DoesNotUseOperandBuffer(tuple, {}, gte1, *dataflow_analysis_)); -} - -TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) { - auto builder = HloComputation::Builder(TestName()); - - Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); - auto gte0 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); - auto gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); - - // Create a DynamicUpdateSlice instruction of tuple element 1. - auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); - auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); - auto dynamic_update_slice = - builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - data_shape, gte1, update, starts)); - builder.AddInstruction( - HloInstruction::CreateTuple({gte0, dynamic_update_slice})); - - BuildModule(builder.Build()); - auto fusion = computation_->CreateFusionInstruction( - {dynamic_update_slice, starts, update, gte1}, - HloInstruction::FusionKind::kLoop); - RunAnalysis(); - - // The fusion instruction never uses tuple element 0, but does use element 1. 
- EXPECT_TRUE( - DoesNotUseOperandBuffer(tuple, {0}, fusion, *points_to_analysis_)); - EXPECT_FALSE( - DoesNotUseOperandBuffer(tuple, {1}, fusion, *points_to_analysis_)); - - EXPECT_TRUE(DoesNotUseOperandBuffer(tuple, {0}, fusion, *dataflow_analysis_)); - EXPECT_FALSE( - DoesNotUseOperandBuffer(tuple, {1}, fusion, *dataflow_analysis_)); -} - -class CanShareOperandBufferWithUserTest : public PointsToAnalysisTestBase {}; - -TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) { - auto builder = HloComputation::Builder(TestName()); - - Shape shape = ShapeUtil::MakeShape(F32, {8}); - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, shape, "param")); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); - auto log = builder.AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kLog, exp)); - - BuildModuleAndRunAnalysis(builder.Build()); - - EXPECT_TRUE( - CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_)); - EXPECT_TRUE( - CanShareOperandBufferWithUser(exp, {}, log, {}, *points_to_analysis_)); - - EXPECT_TRUE( - CanShareOperandBufferWithUser(param, {}, exp, {}, *dataflow_analysis_)); - EXPECT_TRUE( - CanShareOperandBufferWithUser(exp, {}, log, {}, *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) { - auto builder = HloComputation::Builder(TestName()); - - Shape in_shape = ShapeUtil::MakeShape(F32, {8}); - Shape out_shape = ShapeUtil::MakeShape(PRED, {8}); - auto param0 = builder.AddInstruction( - HloInstruction::CreateParameter(0, in_shape, "param0")); - auto param1 = builder.AddInstruction( - HloInstruction::CreateParameter(1, in_shape, "param1")); - auto result = builder.AddInstruction( - HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1)); - - BuildModuleAndRunAnalysis(builder.Build()); - - EXPECT_FALSE(CanShareOperandBufferWithUser(param0, {}, result, {}, - *points_to_analysis_)); - EXPECT_FALSE(CanShareOperandBufferWithUser(param1, {}, result, {}, - *points_to_analysis_)); - - EXPECT_FALSE(CanShareOperandBufferWithUser(param0, {}, result, {}, - *dataflow_analysis_)); - EXPECT_FALSE(CanShareOperandBufferWithUser(param1, {}, result, {}, - *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, CopyShares) { - auto builder = HloComputation::Builder(TestName()); - - Shape shape = ShapeUtil::MakeShape(F32, {8}); - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, shape, "param")); - auto exp = builder.AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); - auto copy = builder.AddInstruction( - HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp)); - - BuildModuleAndRunAnalysis(builder.Build()); - - EXPECT_TRUE( - CanShareOperandBufferWithUser(param, {}, exp, {}, *points_to_analysis_)); - EXPECT_TRUE( - CanShareOperandBufferWithUser(exp, {}, copy, {}, *points_to_analysis_)); - - EXPECT_TRUE( - CanShareOperandBufferWithUser(param, {}, exp, {}, *dataflow_analysis_)); - EXPECT_TRUE( - CanShareOperandBufferWithUser(exp, {}, copy, {}, *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) { - auto builder = HloComputation::Builder(TestName()); - - Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( - 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); - auto gte0 = builder.AddInstruction( - 
HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); - auto gte1 = builder.AddInstruction( - HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); - - // Create a DynamicUpdateSlice instruction of tuple element 1. - auto starts = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); - auto update = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR1({2.f, 2.f, 2.f}))); - auto dynamic_update_slice = - builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - data_shape, gte1, update, starts)); - builder.AddInstruction( - HloInstruction::CreateTuple({gte0, dynamic_update_slice})); - - BuildModule(builder.Build()); - auto fusion = computation_->CreateFusionInstruction( - {dynamic_update_slice, starts, update, gte1}, - HloInstruction::FusionKind::kLoop); - RunAnalysis(); - - // The fusion instruction can share with tuple element 1. - EXPECT_FALSE(CanShareOperandBufferWithUser(tuple, {0}, fusion, {}, - *points_to_analysis_)); - EXPECT_TRUE(CanShareOperandBufferWithUser(tuple, {1}, fusion, {}, - *points_to_analysis_)); - - EXPECT_FALSE(CanShareOperandBufferWithUser(tuple, {0}, fusion, {}, - *dataflow_analysis_)); - EXPECT_TRUE(CanShareOperandBufferWithUser(tuple, {1}, fusion, {}, - *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { - auto builder = HloComputation::Builder(TestName()); - - Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - Shape update_shape = ShapeUtil::MakeShape(F32, {4}); - Shape starts_shape = ShapeUtil::MakeShape(S32, {1}); - auto data = builder.AddInstruction( - HloInstruction::CreateParameter(0, data_shape, "data")); - auto update = builder.AddInstruction( - HloInstruction::CreateParameter(1, update_shape, "update")); - auto starts = builder.AddInstruction( - HloInstruction::CreateParameter(2, starts_shape, "starts")); - auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( - data_shape, data, update, starts)); - - BuildModuleAndRunAnalysis(builder.Build()); - - // The DynamicUpdateSlice instruction can share with the data operand, but not - // with update or starts. 
- EXPECT_TRUE( - CanShareOperandBufferWithUser(data, {}, dus, {}, *points_to_analysis_)); - EXPECT_FALSE( - CanShareOperandBufferWithUser(update, {}, dus, {}, *points_to_analysis_)); - EXPECT_FALSE( - CanShareOperandBufferWithUser(starts, {}, dus, {}, *points_to_analysis_)); - - EXPECT_TRUE( - CanShareOperandBufferWithUser(data, {}, dus, {}, *dataflow_analysis_)); - EXPECT_FALSE( - CanShareOperandBufferWithUser(update, {}, dus, {}, *dataflow_analysis_)); - EXPECT_FALSE( - CanShareOperandBufferWithUser(starts, {}, dus, {}, *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { - auto builder = HloComputation::Builder(TestName()); - Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); - - auto a = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); - auto b = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); - - DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(1); - dot_dnums.add_rhs_contracting_dimensions(0); - auto dot = builder.AddInstruction( - HloInstruction::CreateDot(data_shape, a, b, dot_dnums)); - - auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto add_operand = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape, one, {1})); - - auto add = builder.AddInstruction(HloInstruction::CreateBinary( - data_shape, HloOpcode::kAdd, dot, add_operand)); - - BuildModule(builder.Build()); - auto fusion = computation_->CreateFusionInstruction( - {add, dot}, HloInstruction::FusionKind::kOutput); - RunAnalysis(); - - // Output fused dot add should be able to share buffer with 'add_operand'. - EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, - *points_to_analysis_)); - - EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, - *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, FusedTransposeDotAdd) { - auto builder = HloComputation::Builder(TestName()); - Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); - - auto a = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); - auto b = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); - auto b_t = builder.AddInstruction( - HloInstruction::CreateTranspose(data_shape, b, {1, 0})); - - DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(1); - dot_dnums.add_rhs_contracting_dimensions(0); - auto dot = builder.AddInstruction( - HloInstruction::CreateDot(data_shape, a, b_t, dot_dnums)); - - auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto add_operand = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape, one, {1})); - - auto add = builder.AddInstruction(HloInstruction::CreateBinary( - data_shape, HloOpcode::kAdd, dot, add_operand)); - - BuildModule(builder.Build()); - - auto nested_fusion = computation_->CreateFusionInstruction( - {dot, b_t}, HloInstruction::FusionKind::kTransposeDot); - - auto fusion = computation_->CreateFusionInstruction( - {add, nested_fusion}, HloInstruction::FusionKind::kOutput); - RunAnalysis(); - - // Output fused transpose-dot-add should be share buffer with 'add_operand'. 
- EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, - *points_to_analysis_)); - - EXPECT_TRUE(CanShareOperandBufferWithUser(add_operand, {}, fusion, {}, - *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { - auto builder = HloComputation::Builder(TestName()); - Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); - - auto one = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto operand = builder.AddInstruction( - HloInstruction::CreateBroadcast(data_shape, one, {1})); - - auto reverse = builder.AddInstruction( - HloInstruction::CreateReverse(data_shape, operand, {0, 1})); - - auto two = builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); - - auto add = builder.AddInstruction( - HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two)); - - BuildModule(builder.Build()); - auto fusion = computation_->CreateFusionInstruction( - {add, two, reverse}, HloInstruction::FusionKind::kOutput); - RunAnalysis(); - - // Output fused operand->reverse->add cannot alias operand buffer 'operand'. - EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {}, - *points_to_analysis_)); - - EXPECT_FALSE(CanShareOperandBufferWithUser(operand, {}, fusion, {}, - *dataflow_analysis_)); -} - -TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { - Shape data_shape = ShapeUtil::MakeShape(F32, {8}); - - auto make_cond = [this, &data_shape]() { - auto builder = HloComputation::Builder(TestName() + ".Cond"); - auto data = builder.AddInstruction( - HloInstruction::CreateParameter(0, data_shape, "data")); - builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data)); - return builder.Build(); - }; - - auto make_body = [this, &data_shape]() { - auto builder = HloComputation::Builder(TestName() + ".Body"); - auto data = builder.AddInstruction( - HloInstruction::CreateParameter(0, data_shape, "data")); - builder.AddInstruction( - HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data)); - return builder.Build(); - }; - - module_ = CreateNewModule(); - HloComputation* cond_computation = - module_->AddEmbeddedComputation(make_cond()); - HloComputation* body_computation = - module_->AddEmbeddedComputation(make_body()); - - auto builder = HloComputation::Builder(TestName()); - auto data = builder.AddInstruction( - HloInstruction::CreateParameter(0, data_shape, "data")); - auto whil = builder.AddInstruction(HloInstruction::CreateWhile( - data_shape, cond_computation, body_computation, data)); - computation_ = module_->AddEntryComputation(builder.Build()); - - RunAnalysis(); - - // The While instruction can share with the data operand. - EXPECT_TRUE( - CanShareOperandBufferWithUser(data, {}, whil, {}, *points_to_analysis_)); - - EXPECT_TRUE( - CanShareOperandBufferWithUser(data, {}, whil, {}, *dataflow_analysis_)); -} - -// Tests that Call can alias operand buffer if the only use of the operand -// in the called computation is an elementwise instruction. -TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { - Shape shape = ShapeUtil::MakeShape(F32, {8}); - // Build sub-computation with fusion root. 
- auto sub_builder = HloComputation::Builder(TestName() + "_sub"); - auto sub_param = sub_builder.AddInstruction( - HloInstruction::CreateParameter(0, shape, "sub_param")); - auto one = sub_builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR0(1.0))); - auto ones = sub_builder.AddInstruction( - HloInstruction::CreateBroadcast(shape, one, {1})); - auto add = sub_builder.AddInstruction( - HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones)); - - module_ = CreateNewModule(); - auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build()); - sub_computation->CreateFusionInstruction({add, ones}, - HloInstruction::FusionKind::kLoop); - - // Build entry-computation with kCall which calls 'sub_computation'. - auto builder = HloComputation::Builder(TestName()); - - auto param = builder.AddInstruction( - HloInstruction::CreateParameter(0, shape, "param")); - auto reverse = - builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0})); - auto call = builder.AddInstruction( - HloInstruction::CreateCall(shape, {reverse}, sub_computation)); - computation_ = module_->AddEntryComputation(builder.Build()); - - RunAnalysis(); - - EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {}, - *points_to_analysis_)); - EXPECT_TRUE(CanShareOperandBufferWithUser(reverse, {}, call, {}, - *dataflow_analysis_)); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc index bc683a1880b010..d909845a3a21fc 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc @@ -80,8 +80,10 @@ Status FusedIrEmitter::HandleConstant(HloInstruction* constant) { *ir_builder_->GetInsertBlock()->getModule(), initializer->getType(), /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer, /*Name=*/""); + llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast( + global, llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo()); generators_[constant] = [=](const IrArray::Index& index) { - return IrArray(global, constant->shape()) + return IrArray(shape_constant, constant->shape()) .EmitReadArrayElement(index, ir_builder_); }; @@ -151,7 +153,7 @@ Status FusedIrEmitter::HandleTuple(HloInstruction* tuple) { Status FusedIrEmitter::FinishVisit(HloInstruction* root) { fused_root_ = root; - return tensorflow::Status::OK(); + return Status::OK(); } FusedIrEmitter::Generator FusedIrEmitter::GetRootGenerator() const { diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc index 3312a888443233..7323abeb207715 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.cc @@ -333,18 +333,7 @@ llvm::Value* IrArray::EmitArrayElementAddress( } CHECK_EQ(index.size(), ShapeUtil::Rank(*shape_)); - std::vector actual_index; - bool is_implicit_broadcast = false; - // We perform broadcasting when the operand shape has dimension(s) of size - // 1. In this case we fix the index value for that dimension to zero. This - // effectively broadcasts along this dimension. - for (int64 i = 0; i < index.size(); ++i) { - auto dim = shape_->dimensions(i); - actual_index.push_back(dim == 1 ? 
ir_builder->getInt64(0) : index[i]); - is_implicit_broadcast |= dim == 1; - } - - if (!is_implicit_broadcast && index.LinearValidOnShape(*shape_)) { + if (index.LinearValidOnShape(*shape_)) { llvm::Module* module = ir_builder->GetInsertBlock()->getParent()->getParent(); return ir_builder->CreateInBoundsGEP( @@ -354,6 +343,15 @@ llvm::Value* IrArray::EmitArrayElementAddress( {index.linear()}, llvm_ir::AsStringRef(name)); } + std::vector actual_index; + for (int64 i = 0; i < index.size(); ++i) { + // When dimension i is of size 1, LLVM optimization is able to replace + // index[i] with 0. However, setting index[i] to 0 here still allows LLVM to + // produce better code in some cases. + auto dim = shape_->dimensions(i); + actual_index.push_back(dim == 1 ? ir_builder->getInt64(0) : index[i]); + } + // "base_ptr_" has the type of "*" // (e.g. [3 x [2 x float]]*). Therefore, the address of the indexed element // should be computed by diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h index 06cfb2a36c56c5..4c3195c29c859c 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h +++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h @@ -97,6 +97,10 @@ class IrArray { llvm::Value*& operator[](size_t i) { return multidim()[i]; } void push_back(llvm::Value* value) { multidim().push_back(value); } + void InsertAt(int64 index, llvm::Value* value) { + CHECK_LE(index, size()); + multidim().insert(multidim().begin() + index, value); + } using iterator = std::vector::iterator; using const_iterator = std::vector::const_iterator; diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h index 1c00b2aabd182d..64b935bbf1fb90 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h +++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h @@ -100,6 +100,15 @@ class KernelSupportLibrary { [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); }); } + void For( + tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end, + int64 step, + const std::function& for_body_generator) { + For(name, start, end, ir_builder_->getInt64(step), + /*peel_first_iteration=*/false, + [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); }); + } + void For( tensorflow::StringPiece name, int64 start, int64 end, int64 step, const std::function& for_body_generator) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc index 7b227ce294176c..497b48ff227d7d 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc @@ -36,8 +36,8 @@ ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix, llvm::Value* start_index, llvm::Value* end_index, llvm::Value* step, bool prevent_unrolling, bool prevent_vectorization) - : prefix_(prefix.ToString()), - suffix_(suffix.ToString()), + : prefix_(std::string(prefix)), + suffix_(std::string(suffix)), start_index_(start_index), end_index_(end_index), step_(step), diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h index 20069ce5a28184..d915f95db13491 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h @@ -174,7 +174,7 @@ class ForLoopNest { : ForLoopNest(/*name=*/"", ir_builder) {} 
ForLoopNest(tensorflow::StringPiece name, llvm::IRBuilder<>* ir_builder) - : name_(name.ToString()), + : name_(std::string(name)), outer_loop_preheader_bb_(nullptr), outer_loop_exit_bb_(nullptr), inner_loop_body_bb_(nullptr), diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc index ec04239b4f9112..ff64da87e9c9ac 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc @@ -87,18 +87,10 @@ llvm::Value* EmitCallToIntrinsic( tensorflow::gtl::ArraySlice operands, tensorflow::gtl::ArraySlice overloaded_types, llvm::IRBuilder<>* ir_builder) { - std::vector types; - for (auto type : overloaded_types) { - types.push_back(type); - } llvm::Module* module = ModuleFromIRBuilder(ir_builder); - llvm::Function* intrinsic = - llvm::Intrinsic::getDeclaration(module, intrinsic_id, types); - std::vector operands_vec; - for (auto operand : operands) { - operands_vec.push_back(operand); - } - return ir_builder->CreateCall(intrinsic, operands_vec); + llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration( + module, intrinsic_id, AsArrayRef(overloaded_types)); + return ir_builder->CreateCall(intrinsic, AsArrayRef(operands)); } llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value, @@ -368,15 +360,52 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index, return llvm::ConstantArray::get(aggregate_type, elements); } +template +llvm::Constant* GetConstantDataArray(const Literal& literal, + llvm::Module* module) { + const T* data = static_cast(literal.untyped_data()); + int64 num_elements = literal.size_bytes() / sizeof(T); + return llvm::ConstantDataArray::get(module->getContext(), + llvm::makeArrayRef(data, num_elements)); +} + } // namespace llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal, llvm::Module* module) { - std::vector multi_index(ShapeUtil::Rank(literal.shape()), 0); - llvm::Constant* value = LiteralToConstant( - literal, /*dimension_index=*/ShapeUtil::Rank(literal.shape()) - 1, - &multi_index, module); - return value; + const Shape& shape = literal.shape(); + // TODO(b/29904935): We can get rid of this switch by exposing a + // ConstantDataArray factory method that takes a llvm::Type and a StringRef. + switch (shape.element_type()) { + case U64: + return GetConstantDataArray(literal, module); + case U32: + return GetConstantDataArray(literal, module); + case U8: + return GetConstantDataArray(literal, module); + case S64: + return GetConstantDataArray(literal, module); + case S32: + return GetConstantDataArray(literal, module); + case F64: + return GetConstantDataArray(literal, module); + case F32: + return GetConstantDataArray(literal, module); + case BF16: + case F16: + return GetConstantDataArray(literal, module); + case PRED: + return GetConstantDataArray(literal, module); + // TODO(b/29904935): Also use ConstantDataArray for complex numbers. 
+ case C64: { + int64 dimensions = ShapeUtil::Rank(shape); + std::vector multi_index(dimensions, 0); + return LiteralToConstant(literal, /*dimension_index=*/dimensions - 1, + &multi_index, module); + } + default: + LOG(FATAL) << "unsupported type " << shape.element_type(); + } } llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type, diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc index 3978acc132f34b..dc2934a34c23f8 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc @@ -39,14 +39,13 @@ LoopEmitter::LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, const IrArray& target_array, llvm::IRBuilder<>* ir_builder) - : body_emitter_([=](const llvm_ir::IrArray::Index array_index) - -> ::tensorflow::Status { + : body_emitter_([=](const llvm_ir::IrArray::Index array_index) -> Status { // Convert target_element_generator to a BodyEmitter. TF_ASSIGN_OR_RETURN(llvm::Value * target_element, target_element_generator(array_index)); target_array.EmitWriteArrayElement(array_index, target_element, ir_builder); - return tensorflow::Status::OK(); + return Status::OK(); }), shape_(target_array.GetShape()), ir_builder_(ir_builder) {} @@ -84,7 +83,9 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator, // Sanity check: In multi-output fusion, all shapes produced must have the // same dimensions. for (const IrArray& array : target_arrays) { - CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape())); + CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape())) + << ": '" << shape_.ShortDebugString() << "' does not match '" + << array.GetShape().ShortDebugString() << "'"; } } @@ -124,7 +125,7 @@ std::vector LoopEmitter::EmitIndexAndSetExitBasicBlock( return {array_index}; } -tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) { +Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) { for (const IrArray::Index& array_index : EmitIndexAndSetExitBasicBlock(loop_name)) { TF_RETURN_IF_ERROR(body_emitter_(array_index)); @@ -135,7 +136,7 @@ tensorflow::Status LoopEmitter::EmitLoop(tensorflow::StringPiece loop_name) { if (exit_bb_ != nullptr) { ir_builder_->SetInsertPoint(exit_bb_); } - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace llvm_ir diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h index 9ff497aecd0bc9..b70d28ecd3033e 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h +++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.h @@ -38,8 +38,7 @@ using ElementGenerator = // Emits a loop for every element in the given shape. class LoopEmitter { public: - using BodyEmitter = - std::function; + using BodyEmitter = std::function; LoopEmitter(const BodyEmitter& body_emitter, const Shape& shape, llvm::IRBuilder<>* ir_builder); @@ -72,7 +71,7 @@ class LoopEmitter { tensorflow::StringPiece loop_name); // Emits a complete loop nest for every element in the given shape. - tensorflow::Status EmitLoop(tensorflow::StringPiece loop_name = ""); + Status EmitLoop(tensorflow::StringPiece loop_name = ""); protected: // An IR emitter that generates the loop body. 
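The llvm_util.cc hunk above swaps the element-by-element LiteralToConstant path for llvm::ConstantDataArray on the common primitive types. As a minimal standalone sketch of what the new GetConstantDataArray helper boils down to — assuming only a flat host buffer of a supported element type, and using the templated llvm::ConstantDataArray::get overload; the function name MakeConstantDataArray is illustrative, not an XLA symbol:

```c++
// Sketch: pack a flat host array into a single compact LLVM constant,
// mirroring the per-type switch added to ConvertLiteralToIrConstant above.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"

template <typename T>
llvm::Constant* MakeConstantDataArray(llvm::LLVMContext& context,
                                      const T* data, size_t num_elements) {
  // ConstantDataArray stores the raw element bytes in one constant instead
  // of building an llvm::Constant node per element.
  return llvm::ConstantDataArray::get(context,
                                      llvm::makeArrayRef(data, num_elements));
}
```

Complex (C64) literals keep the old recursive LiteralToConstant path, since ConstantDataArray has no overload for them — hence the TODO retained in the switch.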
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ops.cc b/tensorflow/compiler/xla/service/llvm_ir/ops.cc index 34899b7400464e..dacc54742c0897 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/ops.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/ops.cc @@ -49,22 +49,41 @@ static Status EmitDynamicUpdateSliceInPlaceImpl( for (int64 i = 0; i < rank; ++i) { IrArray::Index dim_index({ir_builder->getInt64(i)}); TF_ASSIGN_OR_RETURN(start_index[i], start_indices_generator(dim_index)); + llvm::Value* output_dim_size = llvm::ConstantInt::get( + start_index[i]->getType(), output_shape.dimensions(i)); + llvm::Value* update_dim_size = llvm::ConstantInt::get( + start_index[i]->getType(), update_shape.dimensions(i)); + + // Clamp the start index so that the update region fits in the operand. + // start_index = clamp(start_index, 0, output_dim_size - update_dim_size) + + // TODO(b/74360564): This is implementation defined behavior, but is + // currently respected by all implementations. Change this if we ever decide + // to officially document different behavior. + llvm::Value* max_bound = + ir_builder->CreateSub(output_dim_size, update_dim_size); + llvm::Value* zero = llvm::ConstantInt::get(start_index[i]->getType(), 0); + start_index[i] = ir_builder->CreateSelect( + ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SGE, zero, start_index[i]), + zero, start_index[i]); + + start_index[i] = ir_builder->CreateSelect( + ir_builder->CreateICmp(llvm::ICmpInst::ICMP_SLE, max_bound, + start_index[i]), + max_bound, start_index[i]); } auto loop_body_emitter = [&](const IrArray::Index& update_index) -> Status { // Calculate output_index, where we'll write the value from update. For // each dimension, // - // output_index[dim] = (start_index[dim] + update_index[dim]) % dim_size. + // output_index[dim] = start_index[dim] + update_index[dim] // IrArray::Index output_index(rank); for (int64 i = 0; i < rank; ++i) { - llvm::Value* dim_size = llvm::ConstantInt::get( - update_index[i]->getType(), output_shape.dimensions(i)); - llvm::Value* start_index0 = ir_builder->CreateZExtOrBitCast( + llvm::Value* start_index0 = ir_builder->CreateSExtOrBitCast( start_index[i], update_index[i]->getType()); - output_index[i] = ir_builder->CreateURem( - ir_builder->CreateAdd(start_index0, update_index[i]), dim_size); + output_index[i] = ir_builder->CreateAdd(start_index0, update_index[i]); } // Do output[output_index] = update[update_index]. diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc index 3a21eda35757aa..5fc08aab916e37 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc +++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.cc @@ -24,15 +24,14 @@ limitations under the License.
#include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" namespace xla { namespace llvm_ir { -void EmitTupleSelect(IrArray select, IrArray pred, llvm::Value* on_true, - llvm::Value* on_false, llvm::IRBuilder<>* ir_builder, - llvm::Module* module) { +void EmitTupleSelect(const IrArray& select, const IrArray& pred, + llvm::Value* on_true, llvm::Value* on_false, + llvm::IRBuilder<>* ir_builder, llvm::Module* module) { CHECK(ShapeUtil::IsScalar(pred.GetShape())); llvm::LoadInst* pred_value = @@ -47,30 +46,27 @@ void EmitTupleSelect(IrArray select, IrArray pred, llvm::Value* on_true, VLOG(2) << " pred_cond: " << DumpToString(*pred_cond); for (int i = 0; i < ShapeUtil::TupleElementCount(select.GetShape()); ++i) { - std::vector element_index = {ir_builder->getInt64(0), - ir_builder->getInt64(i)}; + llvm::Value* const element_index[] = {ir_builder->getInt64(0), + ir_builder->getInt64(i)}; llvm::Value* on_true_element_address = ir_builder->CreateInBoundsGEP(on_true, element_index); llvm::Value* on_true_element = ir_builder->CreateLoad( - on_true_element_address, - tensorflow::strings::Printf("on_true_element_%d", i).c_str()); + on_true_element_address, "on_true_element_" + llvm::Twine(i)); llvm::Value* on_false_element_address = ir_builder->CreateInBoundsGEP(on_false, element_index); llvm::Value* on_false_element = ir_builder->CreateLoad( - on_false_element_address, - tensorflow::strings::Printf("on_false_element_%d", i).c_str()); + on_false_element_address, "on_false_element_" + llvm::Twine(i)); llvm::Value* output_element_address = ir_builder->CreateInBoundsGEP(select.GetBasePointer(), element_index); ir_builder->CreateStore( - ir_builder->CreateSelect( - pred_cond, on_true_element, on_false_element, - tensorflow::strings::Printf("select_output_element_%d", i).c_str()), + ir_builder->CreateSelect(pred_cond, on_true_element, on_false_element, + "select_output_element_" + llvm::Twine(i)), output_element_address); } } -void EmitTuple(IrArray tuple, +void EmitTuple(const IrArray& tuple, tensorflow::gtl::ArraySlice operands, llvm::IRBuilder<>* ir_builder, llvm::Module* module) { for (size_t i = 0; i < operands.size(); ++i) { diff --git a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h index dbf9a140068b60..352d34ebf839c6 100644 --- a/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h +++ b/tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h @@ -59,13 +59,13 @@ namespace llvm_ir { // of the address from the corresponding element in either // tuple_on_true or tuple_on_false: // output[i] = pred ? tuple_on_true[i] : tuple_on_false[i] -void EmitTupleSelect(IrArray select, IrArray pred, llvm::Value* on_true, - llvm::Value* on_false, llvm::IRBuilder<>* ir_builder, - llvm::Module* module); +void EmitTupleSelect(const IrArray& select, const IrArray& pred, + llvm::Value* on_true, llvm::Value* on_false, + llvm::IRBuilder<>* ir_builder, llvm::Module* module); // A tuple is an array of pointers, one for each operand. Each pointer points to // the output buffer of its corresponding operand. 
-void EmitTuple(IrArray tuple, +void EmitTuple(const IrArray& tuple, tensorflow::gtl::ArraySlice operands, llvm::IRBuilder<>* ir_builder, llvm::Module* module); diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc index 0fa4061738612d..1d9c9e0678765a 100644 --- a/tensorflow/compiler/xla/service/local_service.cc +++ b/tensorflow/compiler/xla/service/local_service.cc @@ -24,14 +24,12 @@ limitations under the License. #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/computation_layout.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/platform_util.h" -#include "tensorflow/compiler/xla/service/user_computation.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/shape_layout.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -110,6 +108,11 @@ ExecutionOptions CreateExecutionOptions( ->set_xla_dump_optimized_hlo_proto_to( build_options.dump_optimized_hlo_proto_to().value()); } + if (build_options.dump_unoptimized_hlo_proto_to().has_value()) { + execution_options.mutable_debug_options() + ->set_xla_dump_unoptimized_hlo_proto_to( + build_options.dump_unoptimized_hlo_proto_to().value()); + } if (build_options.dump_per_pass_hlo_proto_to().has_value()) { execution_options.mutable_debug_options() ->set_xla_dump_per_pass_hlo_proto_to( @@ -124,75 +127,17 @@ ExecutionOptions CreateExecutionOptions( LayoutUtil::SetToDefaultLayout( execution_options.mutable_shape_with_output_layout()); } - return execution_options; -} - -} // namespace - -StatusOr> LocalService::CompileExecutable( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const ExecutableBuildOptions& build_options) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(computation)); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - // Validate incoming layouts. 
- if (argument_layouts.size() != program_shape->parameters_size()) { - return InvalidArgument( - "Invalid number of arguments for computation: expected %d, got %zu.", - program_shape->parameters_size(), argument_layouts.size()); + for (const std::string& disabled_pass : build_options.disabled_hlo_passes()) { + execution_options.mutable_debug_options()->add_xla_disable_hlo_passes( + disabled_pass); } - for (int i = 0; i < argument_layouts.size(); ++i) { - const Shape& argument_shape = *argument_layouts[i]; - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape)); - if (!ShapeUtil::Compatible(argument_shape, program_shape->parameters(i))) { - tensorflow::gtl::optional metadata = - user_computation->ParameterMetadata(i); - auto metadata_string = [&metadata]() -> string { - if (!metadata.has_value()) { - return ""; - } - CHECK(metadata.value() != nullptr); - const OpMetadata& m = *metadata.value(); - if (!m.source_file().empty()) { - return tensorflow::strings::Printf( - " (%s:%d)", m.source_file().c_str(), m.source_line()); - } - return ""; - }; - return InvalidArgument( - "Invalid argument shape for argument %d%s, expected %s, got %s.", i, - metadata_string().c_str(), - ShapeUtil::HumanString(program_shape->parameters(i)).c_str(), - ShapeUtil::HumanString(argument_shape).c_str()); - } - } - if (build_options.result_layout() != nullptr) { - TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout( - *build_options.result_layout(), program_shape->result())); - } - - ExecutionOptions execution_options = - CreateExecutionOptions(build_options, program_shape.get()); - TF_ASSIGN_OR_RETURN(std::unique_ptr module_config, - CreateModuleConfig(*program_shape, argument_layouts, - &execution_options, user_computation)); - TF_ASSIGN_OR_RETURN( - se::StreamExecutor * executor, - execute_backend_->stream_executor(build_options.device_ordinal())); - - return BuildExecutable(versioned_handle, std::move(module_config), - execute_backend_.get(), executor, - build_options.device_allocator()); + return execution_options; } +} // namespace + StatusOr> LocalService::CompileExecutable( const XlaComputation& computation, const tensorflow::gtl::ArraySlice argument_layouts, @@ -260,4 +205,15 @@ StatusOr LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) { /*computation_count=*/1); } +StatusOr LocalService::GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number) { + TF_ASSIGN_OR_RETURN(auto buffers, allocation_tracker_.Resolve(data)); + if (replica_number >= buffers.size()) { + return InvalidArgument( + "replica_number %d out of range; must be less than num_replicas = %zu.", + replica_number, buffers.size()); + } + return buffers[replica_number]; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h index 06567cabd6eb28..39d6734c3fc06d 100644 --- a/tensorflow/compiler/xla/service/local_service.h +++ b/tensorflow/compiler/xla/service/local_service.h @@ -41,23 +41,11 @@ class LocalService : public Service { static StatusOr> NewService( const ServiceOptions& options); - // Builds an Executable with the given argument layouts and options. If - // result_layout is non-null, then the executable is compiled to produce a - // result of the given layout. If device_allocator is non-null, then the - // compiler may use it to allocate temp space on the device. The compiler is - // responsible for freeing any memory it allocates this way. 
- StatusOr> CompileExecutable( - const ComputationHandle& computation, - const tensorflow::gtl::ArraySlice argument_layouts, - const ExecutableBuildOptions& options); - // Builds an Executable with the given XlaComputation, argument layouts and // options. If result_layout is non-null, then the executable is compiled to // produce a result of the given layout. If device_allocator is non-null, // then the compiler may use it to allocate temp space on the device. The // compiler is responsible for freeing any memory it allocates this way. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr> CompileExecutable( const XlaComputation& computation, const tensorflow::gtl::ArraySlice argument_layouts, @@ -70,6 +58,11 @@ class LocalService : public Service { // the "easy" case where a single replica is a single device. StatusOr ReplicaNumberToDeviceOrdinal(int replica_number); + // Converts a GlobalDataHandle into a pointer to a ShapedBuffer that's valid + // as long as the handle is valid. + StatusOr GlobalDataToShapedBuffer( + const GlobalDataHandle& data, int replica_number); + private: explicit LocalService(const ServiceOptions& options, std::unique_ptr backend); diff --git a/tensorflow/compiler/xla/service/logical_buffer.cc b/tensorflow/compiler/xla/service/logical_buffer.cc index 68553bed121917..c742d35a7bcafa 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.cc +++ b/tensorflow/compiler/xla/service/logical_buffer.cc @@ -15,9 +15,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/logical_buffer.h" -#include -#include - #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/types.h" @@ -28,43 +25,20 @@ namespace xla { LogicalBuffer::LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id) - : instruction_(instruction), id_(id), color_(kInvalidColor), index_(index) { - const auto& s = shape(); - is_array_ = ShapeUtil::IsArray(s); - is_tuple_ = ShapeUtil::IsTuple(s); -} + : BufferValue(instruction, index, id), + instruction_(instruction), + index_(index) {} + +LogicalBuffer::~LogicalBuffer() {} string LogicalBuffer::ToString() const { + string color_string; + if (has_color()) { + color_string = tensorflow::strings::StrCat(" @", color().value()); + } return tensorflow::strings::StrCat(instruction_->name(), "[", tensorflow::str_util::Join(index_, ","), - "](#", id_, " @", color_.value(), ")"); -} - -std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer) { - out << buffer.ToString(); - return out; -} - -/*static*/ LogicalBufferProto::Location LogicalBuffer::ToLocationProto( - const HloInstruction& instruction, const ShapeIndex& index) { - LogicalBufferProto::Location proto; - proto.set_computation_name(instruction.parent()->name()); - proto.set_instruction_name(instruction.name()); - for (const int64 index_entry : index) { - proto.add_shape_index(index_entry); - } - return proto; -} - -LogicalBufferProto LogicalBuffer::ToProto(const SizeFunction& size_fn) const { - LogicalBufferProto proto; - proto.set_id(id_); - proto.set_size(size_fn(*this)); - LogicalBufferProto::Location proto_location = - ToLocationProto(*instruction_, index_); - proto.mutable_defined_at()->Swap(&proto_location); - proto.set_color(color_.value()); - return proto; + "](#", id(), color_string, ")"); } } // namespace xla diff --git a/tensorflow/compiler/xla/service/logical_buffer.h b/tensorflow/compiler/xla/service/logical_buffer.h 
index 67b205e289e626..f9ba5a554740c9 100644 --- a/tensorflow/compiler/xla/service/logical_buffer.h +++ b/tensorflow/compiler/xla/service/logical_buffer.h @@ -16,11 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ -#include -#include #include -#include +#include "tensorflow/compiler/xla/service/buffer_value.h" #include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -33,133 +31,30 @@ limitations under the License. namespace xla { -// Class describing a contiguous sequence of elements (ie, C array) which form -// the components of Shaped values in XLA. XLA arrays are trivially a -// single LogicalBuffer. Tuple values are made up of more than one -// LogicalBuffer: a LogicalBuffer for the pointers to elements, and a -// LogicalBuffer for each child element. -// -// Every buffer is defined by a particular instruction and most instructions -// define only a single buffer. Instructions which define a single buffer -// include array-shaped instructions such as Add but also includes Tuple-shaped -// instructions such as Tuple. The Tuple instruction defines a single buffer -// which is a vector of pointers to the buffers containing the Tuple -// instruction's operands. Though the result of the Tuple instruction includes -// multiple buffers only the top-level buffer (the vector of pointers) is -// defined by the Tuple instruction. The buffers containing the tuple elements -// are defined by earlier instructions, usually the operands of the Tuple -// instruction. -// -// Instructions which construct both the tuple *and* the tuple elements define -// more than one buffer. This includes (at least) tuple-shaped Constant, -// Parameter, Infeed and While instructions. The tuple-shaped instructions do -// not assemble a tuple from existing buffers like the Tuple instruction does, -// but rather define the entire tuple. -// -// Some instructions, such as Bitcast, define no buffers. These instructions -// simply forward buffers from their operands. -// -// The LogicalBuffer object describes which HLO instruction defines a buffer and -// where within that instruction's output shape the buffer is defined. The -// location within the output shape is indicated by LogicalBuffer::index() which -// is defined identically to the index used in -// ShapeUtil::GetSubshape(). Examples: -// -// %add = Add(%foo, %bar) -// %tuple_constant = Constant({1, {42, 43}}) -// -// %add defines a single array-shaped buffer LogicalBuffer(%add, {}) which holds -// the array result of the add operation. The nested-tuple-shaped -// %tuple_constant defines 5 buffers described by the following LogicalBuffer -// objects: -// -// LogicalBuffer(%tuple_constant, {}) // "Top-level" buffer: vector of -// // pointers to LogicalBuffers at -// // indices {0} and {1} -// LogicalBuffer(%tuple_constant, {0}) // Holds value "1" -// LogicalBuffer(%tuple_constant, {1}) // Holds nested tuple: vector of -// // pointers to LogicalBuffers at -// // indices {1, 0} and {1, 1} -// LogicalBuffer(%tuple_constant, {1, 0}) // Holds value "42" -// LogicalBuffer(%tuple_constant, {1, 1}) // Holds value "43" -class LogicalBuffer { +// TuplePointsToAnalysis uses this subclass of BufferValue. 
+class LogicalBuffer : public BufferValue { public: - TF_LIB_GTL_DEFINE_INT_TYPE(Color, int64); - - // Id is a unique identifier for the LogicalBuffer to facilitate efficient - // collections of LogicalBuffers with stable iteration order. - // LogicalBuffers are typically created and accessed through - // TuplePointsToAnalysis, and points-to analysis assigns each LogicalBuffer a - // unique value. - using Id = int64; - - // Functions which return the size and alignment of a logical buffer in bytes. - using SizeFunction = std::function; - using AlignmentFunction = std::function; - LogicalBuffer(HloInstruction* instruction, const ShapeIndex& index, Id id); - - Id id() const { return id_; } + ~LogicalBuffer() override; // Return the instruction that defines the buffer. - HloInstruction* instruction() const { return instruction_; } + HloInstruction* instruction() const override { return instruction_; } // Return the index within the output of the instruction where the buffer is // defined. Index used defined as in ShapeUtil::GetSubshape() - const ShapeIndex& index() const { return index_; } - - // Return the color of the logical buffer. Differently colored buffers can - // not be parts of the same allocation. - Color color() const { - CHECK_NE(color_, kInvalidColor) - << "Should not query the color of a buffer that was never colored"; - return color_; - } - - void set_color(Color color) { - CHECK_NE(color, kInvalidColor) - << "Should not set the color of a buffer to the invalid color"; - color_ = color; - } - - bool has_color() const { return color_ != kInvalidColor; } + const ShapeIndex& index() const override { return index_; } // Return the shape of the buffer. This reference points into the shape field // of the instruction defining the buffer. Therefore, the returned shape will // contain the layout of instruction, if any. - const Shape& shape() const { + const Shape& shape() const override { return ShapeUtil::GetSubshape(instruction_->shape(), index_); } - // Returns true if this buffer is the top-level output buffer of the defining - // HLO instruction. This is equivalent to index == {}. - bool IsTopLevel() const { return index_.empty(); } - - // Whether this buffer contains a tuple. - bool IsTuple() const { return is_tuple_; } - - // Whether this buffer contains an array. - bool IsArray() const { return is_array_; } - - // operator< is required for std::set. - bool operator<(const LogicalBuffer& other) const { return id_ < other.id_; } - - string ToString() const; - LogicalBufferProto ToProto(const SizeFunction& size_fn) const; - - // Returns the LogicalBufferProto::Location that serializes the given - // instruction and index. 
- static LogicalBufferProto::Location ToLocationProto( - const HloInstruction& instruction, const ShapeIndex& index); - - const Color kInvalidColor = Color(-1); + string ToString() const override; private: HloInstruction* instruction_; - Id id_ : 62; - bool is_array_ : 1; - bool is_tuple_ : 1; - Color color_; ShapeIndex index_; // Similar to HLO constructs (HloInstruction, etc), pointers are used for @@ -167,8 +62,6 @@ class LogicalBuffer { TF_DISALLOW_COPY_AND_ASSIGN(LogicalBuffer); }; -std::ostream& operator<<(std::ostream& out, const LogicalBuffer& buffer); - } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_SERVICE_LOGICAL_BUFFER_H_ diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc index 6aca6ba38572c5..f410921b4b5337 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc @@ -125,6 +125,12 @@ Status LogicalBufferAnalysis::HandleBitcast(HloInstruction*) { return Status::OK(); } +Status LogicalBufferAnalysis::HandleDomain(HloInstruction*) { + // A kDomain instruction aliases its operand. That is, the buffer of its + // result *is* the buffer of its operand. + return Status::OK(); +} + Status LogicalBufferAnalysis::HandleRecvDone(HloInstruction*) { // RecvDone doesn't create a new buffer but rather aliases its input (Recv) // tuple element at {0} to its output. diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h index f4c63dd86b4d8a..b5ef3967875a58 100644 --- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h +++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h @@ -59,6 +59,7 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault { Status HandleTuple(HloInstruction* tuple) override; Status HandleGetTupleElement(HloInstruction* get_tuple_element) override; Status HandleBitcast(HloInstruction* bitcast) override; + Status HandleDomain(HloInstruction* domain) override; Status HandleCopy(HloInstruction* copy) override; Status HandleRecvDone(HloInstruction* recv_done) override; Status HandleSend(HloInstruction* send) override; diff --git a/tensorflow/compiler/xla/service/name_uniquer.cc b/tensorflow/compiler/xla/service/name_uniquer.cc index f74bcb0b79355c..3a6a7c25f4b727 100644 --- a/tensorflow/compiler/xla/service/name_uniquer.cc +++ b/tensorflow/compiler/xla/service/name_uniquer.cc @@ -53,7 +53,7 @@ NameUniquer::NameUniquer(const string& separator) { } string NameUniquer::GetUniqueName(tensorflow::StringPiece prefix) { - string root = GetSanitizedName(prefix.empty() ? "name" : prefix.ToString()); + string root = GetSanitizedName(prefix.empty() ? "name" : std::string(prefix)); // Strip away numeric suffix (if any). Only recognize separator if it is in // the middle of the name. diff --git a/tensorflow/compiler/xla/service/owning_device_memory.cc b/tensorflow/compiler/xla/service/owning_device_memory.cc new file mode 100644 index 00000000000000..c115bc097f3b1d --- /dev/null +++ b/tensorflow/compiler/xla/service/owning_device_memory.cc @@ -0,0 +1,35 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/owning_device_memory.h" + +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" + +namespace xla { + +void OwningDeviceMemory::Free() { + CHECK(allocator_ != nullptr) + << "Can't call Free() on an inactive (i.e. moved from, Forget()'ten, " + "or Free()'ed) instance."; + auto status = allocator_->Deallocate(device_ordinal_, mem_); + if (!status.ok()) { + LOG(WARNING) << "Deallocating buffer " << mem_.opaque() << " failed."; + } + + allocator_ = nullptr; + mem_ = se::DeviceMemoryBase(); +} + +} // namespace xla diff --git a/tensorflow/compiler/xla/service/owning_device_memory.h b/tensorflow/compiler/xla/service/owning_device_memory.h new file mode 100644 index 00000000000000..9cf071f0d9d09d --- /dev/null +++ b/tensorflow/compiler/xla/service/owning_device_memory.h @@ -0,0 +1,131 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_ + +#include "tensorflow/compiler/xla/statusor.h" +#include "tensorflow/compiler/xla/types.h" +#include "tensorflow/core/platform/macros.h" +#include "tensorflow/core/platform/stream_executor_no_cuda.h" + +namespace xla { + +// Break circular dependency between this file and device_memory_allocator.h. +class DeviceMemoryAllocator; + +// Owning pointer for memory on a device. +// +// OwningDeviceMemory is an owning pointer like std::unique_ptr, but it can +// point to memory that resides on a "device" (e.g. a GPU). When an +// OwningDeviceMemory goes out of scope, it frees the memory it owns. +// +// We say that an instance of OwningDeviceMemory is "active" if it currently +// owns a (possibly empty) slice of memory on the device. Moving, Forget()'ing, +// Free()'ing, and other actions can deactivate an active object. +// +// Note that we can't simply use stream_executor::ScopedDeviceMemory instead of +// OwningDeviceMemory, because ScopedDeviceMemory frees its pointer via a +// StreamExecutor. This class needs to free via a xla::DeviceMemoryAllocator.
+class OwningDeviceMemory { + public: + OwningDeviceMemory() : device_ordinal_(-1), allocator_(nullptr) {} + + explicit OwningDeviceMemory(se::DeviceMemoryBase mem, int device_ordinal, + DeviceMemoryAllocator* allocator) + : mem_(mem), device_ordinal_(device_ordinal), allocator_(allocator) { + CHECK(allocator != nullptr) << "allocator cannot be null."; + } + + OwningDeviceMemory(OwningDeviceMemory&& other) + : mem_(other.mem_), + device_ordinal_(other.device_ordinal_), + allocator_(other.allocator_) { + other.mem_ = se::DeviceMemoryBase(); + other.allocator_ = nullptr; + } + + OwningDeviceMemory& operator=(OwningDeviceMemory&& other) { + if (allocator_ != nullptr) { + Free(); + } + mem_ = other.mem_; + device_ordinal_ = other.device_ordinal_; + allocator_ = other.allocator_; + + other.mem_ = se::DeviceMemoryBase(); + other.allocator_ = nullptr; + return *this; + } + + // Deactivates this instance if it's active. Nop if it's not active. + OwningDeviceMemory& operator=(std::nullptr_t) { + if (allocator_ != nullptr) { + Free(); + } + return *this; + } + + ~OwningDeviceMemory() { + if (allocator_ != nullptr) { + Free(); + } + } + + // The returned allocator is nonnull iff this object is active. + DeviceMemoryAllocator* allocator() const { return allocator_; } + + int device_ordinal() const { return device_ordinal_; } + + // Gets the device memory pointer. + const void* opaque() const { return mem_.opaque(); } + void* opaque() { return mem_.opaque(); } + + uint64 size() const { return mem_.size(); } + + // Determines whether this wraps a null pointer. + // + // !is_null() is sufficient but not necessary to imply `this` is active. + bool is_null() const { return mem_.is_null(); } + + se::DeviceMemoryBase AsDeviceMemoryBase() { + return se::DeviceMemoryBase(opaque(), size(), /*is_sub_buffer=*/false); + } + + // Returns the wrapped DeviceMemoryBase without freeing it, and deactivates + // this object. Precondition: `this` is active. + TF_MUST_USE_RESULT se::DeviceMemoryBase Forget() { + CHECK(allocator_ != nullptr) + << "Can't call Forget() on an inactive (i.e. moved from, Forget()'ten, " + "or Free()'ed) instance."; + allocator_ = nullptr; + se::DeviceMemoryBase mem(mem_); + mem_ = se::DeviceMemoryBase(); + return mem; + } + + // Frees the wrapped DeviceMemoryBase and deactivates this object. + // Precondition: `this` is active. + void Free(); + + private: + se::DeviceMemoryBase mem_; + int device_ordinal_; + DeviceMemoryAllocator* allocator_; // Null if this object is inactive. +}; + +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_OWNING_DEVICE_MEMORY_H_ diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h index 586f6ef7a9c4f1..2515222cf2db3d 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher.h +++ b/tensorflow/compiler/xla/service/pattern_matcher.h @@ -204,7 +204,7 @@ class LayoutPattern { // Modifies the pattern to match only if the layout equals the given proto. // The layout must outlive the returned pattern. constexpr LayoutPattern> EqualTo( - const Layout* layout) const { + const ::xla::Layout* layout) const { return LayoutPattern>( LayoutPatternEqualImpl(impl_, layout), matched_layout_); } @@ -702,6 +702,30 @@ class HloInstructionPatternOperandImpl { HloInstructionPattern operand_; }; +// An HloInstructionPattern implementation that matches only if the instruction +// is a fusion node with a particular kind. 
+template +class HloInstructionPatternFusionKindImpl { + public: + explicit constexpr HloInstructionPatternFusionKindImpl( + const Previous& previous, ::xla::HloInstruction::FusionKind kind) + : previous_(previous), kind_(kind) {} + + bool Match(const ::xla::HloInstruction* inst) const { + return previous_.Match(inst) && inst->opcode() == HloOpcode::kFusion && + inst->fusion_kind() == kind_; + } + + bool Match(::xla::HloInstruction* inst) const { + return previous_.Match(inst) && inst->opcode() == HloOpcode::kFusion && + inst->fusion_kind() == kind_; + } + + private: + Previous previous_; + ::xla::HloInstruction::FusionKind kind_; +}; + // A pattern that matches HloInstructions. template class HloInstructionPattern { @@ -807,6 +831,16 @@ class HloInstructionPattern { matched_inst_); } + // Modifies the pattern to match only if the instruction is a fusion node with + // the given kind. + constexpr HloInstructionPattern> + WithFusionKind(HloInstruction::FusionKind kind) const { + return HloInstructionPattern>( + HloInstructionPatternFusionKindImpl(impl_, kind), matched_inst_); + } + private: Impl impl_; HloInstructionType** matched_inst_; diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc index c88157c312524f..fef3c132b0f346 100644 --- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc +++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc @@ -16,7 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/pattern_matcher.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -29,7 +29,7 @@ TEST(PatternMatcherTest, AddOp) { ROOT %two_plus_two = f32[] add(f32[] %two, f32[] %two) } )"; - TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr)); + TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); const HloInstruction* matched_inst; HloInstruction* matched_operand; @@ -170,5 +170,28 @@ TEST(PatternMatcherTest, TupleShape) { Match(&tuple_shape, match::Shape().WithSubshape({0, 0}, match::Shape()))); } +TEST(PatternMatcherTest, FusionKind) { + constexpr char kModuleStr[] = R"( + HloModule test_module + + fused_computation { + ROOT fp0 = f32[] parameter(0) + } + + ENTRY while.v11 { + p0 = f32[] parameter(0) + ROOT fusion = f32[] fusion(p0), kind=kLoop, calls=fused_computation + })"; + TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr)); + + auto* root = hlo_module->entry_computation()->root_instruction(); + EXPECT_TRUE(Match( + root, match::Op().WithFusionKind(HloInstruction::FusionKind::kLoop))); + EXPECT_FALSE(Match( + root, match::Op().WithFusionKind(HloInstruction::FusionKind::kInput))); + EXPECT_FALSE(Match(root->operand(0), match::Op().WithFusionKind( + HloInstruction::FusionKind::kLoop))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc index e2c07e38271df8..688cceff0cd10d 100644 --- a/tensorflow/compiler/xla/service/reduce_precision_insertion.cc +++ b/tensorflow/compiler/xla/service/reduce_precision_insertion.cc @@ -75,7 +75,7 @@ StatusOr ReducePrecisionInsertion::insert_after( return false; } - // Check that we haven't already inserted an equivalant 
reduce-precision + // Check that we haven't already inserted an equivalent reduce-precision // operation after this instruction. (The zero-user case occurs when this is // the root instruction.) if (instruction->user_count() > 0) { diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc index 086bd61dd04aa1..d01c35b9923131 100644 --- a/tensorflow/compiler/xla/service/service.cc +++ b/tensorflow/compiler/xla/service/service.cc @@ -36,7 +36,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/hlo_module_config.h" #include "tensorflow/compiler/xla/service/hlo_proto_util.h" #include "tensorflow/compiler/xla/service/platform_util.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/service/source_map_util.h" #include "tensorflow/compiler/xla/service/transfer_manager.h" #include "tensorflow/compiler/xla/shape_layout.h" @@ -62,12 +61,11 @@ namespace xla { namespace { -// Records the arguments used to invoke a computation in a SessionModule -// proto. -tensorflow::Status RecordArguments( +// Records the arguments used to invoke a computation in an HloSnapshot proto. +Status RecordArguments( const tensorflow::gtl::ArraySlice arguments, se::StreamExecutor* executor, TransferManager* transfer_manager, - SessionModule* module) { + HloSnapshot* module) { module->clear_arguments(); for (const ShapedBuffer* argument : arguments) { TF_ASSIGN_OR_RETURN( @@ -75,20 +73,18 @@ tensorflow::Status RecordArguments( transfer_manager->TransferLiteralFromDevice(executor, *argument)); *module->add_arguments() = literal->ToProto(); } - return tensorflow::Status::OK(); + return Status::OK(); } -// Records the result of a computation in a SessionModule proto. -tensorflow::Status RecordResult(const ShapedBuffer& result, - se::StreamExecutor* executor, - TransferManager* transfer_manager, - SessionModule* module) { +// Records the result of a computation in a HloSnapshot proto. 
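// Usage note: when an executable is dumping a snapshot (i.e. the
// xla_dump_executions_to debug option was set at build time), ExecuteGraph
// below calls RecordArguments() before running the computation and this
// function afterwards, then persists the populated HloSnapshot via
// Executable::DumpHloSnapshot().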
+Status RecordResult(const ShapedBuffer& result, se::StreamExecutor* executor, + TransferManager* transfer_manager, HloSnapshot* module) { module->clear_result(); TF_ASSIGN_OR_RETURN( std::unique_ptr literal, transfer_manager->TransferLiteralFromDevice(executor, result)); *module->mutable_result() = literal->ToProto(); - return tensorflow::Status::OK(); + return Status::OK(); } } // namespace @@ -171,35 +167,20 @@ Service::Service(const ServiceOptions& options, } } -tensorflow::Status Service::Computation(const ComputationRequest* arg, - ComputationResponse* result) { - if (arg->name().empty()) { - return InvalidArgument("computation request needs a name"); - } - - *result->mutable_computation() = - computation_tracker_.NewComputation(arg->name()); - VLOG(1) << Printf("Created new computation %s on service %p, name %s", - result->computation().ShortDebugString().c_str(), this, - arg->name().c_str()); - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::CreateChannelHandle( - const CreateChannelHandleRequest* arg, - CreateChannelHandleResponse* result) { +Status Service::CreateChannelHandle(const CreateChannelHandleRequest* arg, + CreateChannelHandleResponse* result) { *result->mutable_channel() = channel_tracker_.NewChannel(); - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::Unregister(const UnregisterRequest* arg, - UnregisterResponse* result) { +Status Service::Unregister(const UnregisterRequest* arg, + UnregisterResponse* result) { return allocation_tracker_.Unregister(arg->data()); } // Deconstructs a previously-allocated global handle. -tensorflow::Status Service::DeconstructTuple(const DeconstructTupleRequest* arg, - DeconstructTupleResponse* result) { +Status Service::DeconstructTuple(const DeconstructTupleRequest* arg, + DeconstructTupleResponse* result) { TF_ASSIGN_OR_RETURN( std::vector elements, allocation_tracker_.DeconstructTuple(arg->tuple_handle())); @@ -207,11 +188,11 @@ tensorflow::Status Service::DeconstructTuple(const DeconstructTupleRequest* arg, for (auto& element : elements) { *result->add_element_handles() = element; } - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::ValidateResultShapeWithLayout( - const Shape& shape_with_layout, const Shape& result_shape) const { +Status Service::ValidateResultShapeWithLayout(const Shape& shape_with_layout, + const Shape& result_shape) const { if (!ShapeUtil::Compatible(shape_with_layout, result_shape)) { return InvalidArgument( "Shape used to set computation result layout %s is not compatible " @@ -265,11 +246,12 @@ Service::ResolveAndValidateArguments( StatusOr> Service::CreateModuleConfig( const ProgramShape& program_shape, tensorflow::gtl::ArraySlice argument_shapes, - const ExecutionOptions* execution_options, - const UserComputation* user_computation) { + const ExecutionOptions* execution_options) { auto config = MakeUnique(program_shape); - auto* computation_layout = config->mutable_entry_computation_layout(); - + ComputationLayout* host_computation_layout = + config->mutable_host_entry_computation_layout(); + ComputationLayout* device_computation_layout = + config->mutable_device_entry_computation_layout(); if (program_shape.parameters_size() != argument_shapes.size()) { return InvalidArgument("computation takes %d parameters, but %zu given", program_shape.parameters_size(), @@ -280,23 +262,16 @@ StatusOr> Service::CreateModuleConfig( // ProgramShape. 
if (!ShapeUtil::Compatible(*argument_shapes[i], program_shape.parameters(i))) { - if (user_computation == nullptr) { - return InvalidArgument( - "Argument does not match shape of computation parameter %d: want " - "%s, got %s", - i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), - ShapeUtil::HumanString(*argument_shapes[i]).c_str()); - } - return InvalidParameterArgument( - *user_computation->ParameterMetadata(i).value(), - "Argument does not match shape of computation parameter %d: want %s, " - "got %s", + return InvalidArgument( + "Argument does not match shape of computation parameter %d: want " + "%s, got %s", i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(), ShapeUtil::HumanString(*argument_shapes[i]).c_str()); } - TF_RETURN_IF_ERROR( - computation_layout->mutable_parameter_layout(i)->CopyLayoutFromShape( - *argument_shapes[i])); + TF_RETURN_IF_ERROR(host_computation_layout->mutable_parameter_layout(i) + ->CopyLayoutFromShape(*argument_shapes[i])); + TF_RETURN_IF_ERROR(device_computation_layout->mutable_parameter_layout(i) + ->CopyLayoutFromShape(*argument_shapes[i])); } if (execution_options != nullptr && execution_options->has_shape_with_output_layout()) { @@ -305,10 +280,20 @@ StatusOr> Service::CreateModuleConfig( TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(shape_with_output_layout, program_shape.result())); TF_RETURN_IF_ERROR( - computation_layout->mutable_result_layout()->CopyLayoutFromShape( + host_computation_layout->mutable_result_layout()->CopyLayoutFromShape( + shape_with_output_layout)); + TF_RETURN_IF_ERROR( + device_computation_layout->mutable_result_layout()->CopyLayoutFromShape( shape_with_output_layout)); } else { - computation_layout->mutable_result_layout()->Clear(); + // If the result layout is not set, then choose the default. + // TODO(b/29118294): Allow the compiler to choose a better layout in this + // case. + // TODO(b/78356948): We are forcing the default layout here. We should fix + // clients which expect a default layout, to be explicit about it, by + // passing the proper ExecutionOptions with shape_with_output_layout set. + host_computation_layout->mutable_result_layout()->SetToDefaultLayout(); + device_computation_layout->mutable_result_layout()->SetToDefaultLayout(); } config->set_replica_count(options_.number_of_replicas()); @@ -330,85 +315,43 @@ StatusOr> Service::CreateModuleConfig( StatusOr> Service::CreateModuleConfig( const ProgramShape& program_shape, tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions& execution_options, - const UserComputation* user_computation) { + const ExecutionOptions& execution_options) { std::vector argument_shapes; for (const auto* arg : arguments) { argument_shapes.push_back(&arg->on_host_shape()); } - return CreateModuleConfig(program_shape, argument_shapes, &execution_options, - user_computation); + return CreateModuleConfig(program_shape, argument_shapes, &execution_options); } StatusOr>> Service::BuildExecutables( - std::vector versioned_handles, + const std::vector& module_protos, std::vector> module_configs, Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator) { VLOG(1) << Printf("BuildExecutable on service %p", this); // Dump computation proto state if flag is set. 
- std::vector> session_modules; - for (int64 i = 0; i < versioned_handles.size(); ++i) { + std::vector> hlo_snapshots; + for (int64 i = 0; i < module_protos.size(); ++i) { const string& directory_path = module_configs[i]->debug_options().xla_dump_computations_to(); - const string& other_directory_path = + const string& execution_directory_path = module_configs[i]->debug_options().xla_dump_executions_to(); - if (directory_path.empty() && other_directory_path.empty()) { + if (directory_path.empty() && execution_directory_path.empty()) { continue; } - TF_ASSIGN_OR_RETURN( - std::unique_ptr session_module, - computation_tracker_.SnapshotComputation(versioned_handles[i].handle)); + auto hlo_snapshot = MakeUnique(); + *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = *module_protos[i]; if (!directory_path.empty()) { - string filename = Printf("computation_%lld__%s__version_%lld", - versioned_handles[i].handle.handle(), - session_module->entry().name().c_str(), - versioned_handles[i].version); - TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename, - *session_module)); - session_modules.push_back(std::move(session_module)); - } - } - - VLOG(1) << "Computation handles:"; - for (const VersionedComputationHandle& versioned_handle : versioned_handles) { - VLOG(1) << versioned_handle; - } - - CHECK_EQ(versioned_handles.size(), module_configs.size()); - std::vector> modules; - for (int64 i = 0; i < versioned_handles.size(); ++i) { - const VersionedComputationHandle& versioned_handle = versioned_handles[i]; - const HloModuleConfig& config = *module_configs[i]; - TF_ASSIGN_OR_RETURN(auto module, - computation_tracker_.BuildHloModule( - versioned_handle, config, - /*include_unreachable_instructions=*/true)); - modules.push_back(std::move(module)); - } - - TF_ASSIGN_OR_RETURN( - std::vector> executables, - backend->compiler()->Compile(std::move(modules), std::move(executors), - device_allocator)); - - for (size_t i = 0; i < versioned_handles.size(); ++i) { - if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) { - executables[i]->set_session_module(std::move(session_modules[i])); + string filename = + Printf("computation_%lld__%s", module_protos[i]->id(), + module_protos[i]->entry_computation_name().c_str()); + TF_RETURN_IF_ERROR( + Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot)); + hlo_snapshots.push_back(std::move(hlo_snapshot)); } } - return std::move(executables); -} - -StatusOr>> Service::BuildExecutables( - const std::vector& module_protos, - std::vector> module_configs, - Backend* backend, std::vector> executors, - DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf("BuildExecutable on service %p", this); - VLOG(1) << "Computations:"; for (const HloModuleProto* proto : module_protos) { VLOG(1) << proto->name(); @@ -429,97 +372,29 @@ StatusOr>> Service::BuildExecutables( backend->compiler()->Compile(std::move(modules), std::move(executors), device_allocator)); - return std::move(executables); -} - -StatusOr> Service::BuildExecutable( - const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) { - VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this, - versioned_handle.ToString().c_str()); - - // Dump computation proto state if flag is set. 
- std::unique_ptr session_module; - const string& directory_path = - module_config->debug_options().xla_dump_computations_to(); - const string& other_directory_path = - module_config->debug_options().xla_dump_executions_to(); - if (!directory_path.empty() || !other_directory_path.empty()) { - TF_ASSIGN_OR_RETURN( - session_module, - computation_tracker_.SnapshotComputation(versioned_handle.handle)); - if (!directory_path.empty()) { - string filename = Printf("computation_%lld__%s__version_%lld", - versioned_handle.handle.handle(), - session_module->entry().name().c_str(), - versioned_handle.version); - TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename, - *session_module)); + for (size_t i = 0; i < module_protos.size(); ++i) { + if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) { + executables[i]->set_hlo_snapshot(std::move(hlo_snapshots[i])); } } - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.BuildHloModule(versioned_handle, *module_config, - /*include_unreachable_instructions=*/ - true)); - - TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module)); - - TF_ASSIGN_OR_RETURN( - module, backend->compiler()->RunHloPasses(std::move(module), executor, - device_allocator)); - - TF_ASSIGN_OR_RETURN(std::unique_ptr executable, - backend->compiler()->RunBackend( - std::move(module), executor, device_allocator)); - - if (!other_directory_path.empty()) { - executable->set_session_module(std::move(session_module)); - } - - return std::move(executable); + return std::move(executables); } -StatusOr> Service::BuildAndCacheExecutable( - const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, ExecutionProfile* profile, - DeviceMemoryAllocator* device_allocator) { - std::shared_ptr executable = - compilation_cache_.LookUp(versioned_handle, *module_config); - - if (executable != nullptr) { - // Executable found in the computation cache. - if (profile != nullptr) { - profile->set_compilation_cache_hit(true); - } - return executable; - } - - uint64 start_micros = - // Avoid reading the clock if we don't want timing info - (profile != nullptr) ? tensorflow::Env::Default()->NowMicros() : 0; - - // Take a copy of the module config, as compilation introduces layouts where - // layouts were optional before. - HloModuleConfig original_module_config = *module_config; - TF_ASSIGN_OR_RETURN( - std::unique_ptr executable_unique_ptr, - BuildExecutable(versioned_handle, std::move(module_config), backend, - executor, device_allocator)); - - if (profile != nullptr) { - uint64 end_micros = tensorflow::Env::Default()->NowMicros(); - uint64 milliseconds = (end_micros - start_micros) / 1000; - profile->set_compilation_cache_hit(false); - profile->set_compile_time_ms(milliseconds); - } - - // Insert executable into the cache. 
- return compilation_cache_.Insert(std::move(executable_unique_ptr), - original_module_config); +Status Service::ValidateEntryComputationLayout(HloModule* module) { + const ComputationLayout& on_device = + module->device_entry_computation_layout(); + for (int64 i = 0; i < on_device.parameter_count(); ++i) { + TF_RET_CHECK(ShapeUtil::Equal( + on_device.parameter_shape(i), + execute_backend_->transfer_manager()->HostShapeToDeviceShape( + module->host_entry_computation_layout().parameter_shape(i)))); + } + TF_RET_CHECK(ShapeUtil::Equal( + module->device_entry_computation_layout().result_shape(), + execute_backend_->transfer_manager()->HostShapeToDeviceShape( + module->host_entry_computation_layout().result_shape()))); + return Status::OK(); } StatusOr> @@ -542,9 +417,16 @@ Service::ExecuteParallelAndRegisterResult( // profiled. std::map index_to_profiled_streams; - TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment, - backend->computation_placer()->AssignDevices( - options_.number_of_replicas(), executables.size())); + // Build DeviceAssignment for all cores based on the provided device handles. + DeviceAssignment device_assignment(options_.number_of_replicas(), + executables.size()); + for (int64 i = 0; i < executables.size(); i++) { + TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i])); + CHECK_EQ(replicas.size(), arguments[i].size()); + for (int64 replica = 0; replica < replicas.size(); ++replica) { + device_assignment(replica, i) = replicas[replica]->device_ordinal(); + } + } for (int64 i = 0; i < executables.size(); i++) { // Stream executors for the replicas of the current computation. @@ -574,7 +456,6 @@ Service::ExecuteParallelAndRegisterResult( ExecutableRunOptions options; options.set_stream(streams.back().get()); options.set_allocator(backend->memory_allocator()); - options.set_inter_op_thread_pool(backend->inter_op_thread_pool()); options.set_intra_op_thread_pool( backend->eigen_intra_op_thread_pool_device()); options.set_device_assignment(&device_assignment); @@ -688,12 +569,12 @@ StatusOr Service::ExecuteAndRegisterResult( options.set_stream(stream.get()); options.set_device_ordinal(stream->parent()->device_ordinal()); options.set_allocator(backend->memory_allocator()); - options.set_inter_op_thread_pool(backend->inter_op_thread_pool()); options.set_intra_op_thread_pool( backend->eigen_intra_op_thread_pool_device()); options.set_device_assignment(&device_assignment); - run_options.emplace_back(options, backend->StreamBorrower(), - backend->inter_op_thread_pool()); + run_options.emplace_back( + options, backend->StreamBorrower(), + /*xla_intra_op_thread_pool=*/backend->eigen_intra_op_thread_pool()); } if (options_.number_of_replicas() == 1) { @@ -718,13 +599,6 @@ StatusOr Service::ExecuteAndRegisterResult( result_tag); } -tensorflow::Status Service::SetReturnValue(const SetReturnValueRequest* arg, - SetReturnValueResponse* results) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - return computation->SetReturnValue(arg->operand()); -} - StatusOr> Service::GetExecutors( const ExecutionOptions& execution_options, int64 requests_size, int64 request_index) const { @@ -766,119 +640,8 @@ StatusOr>> Service::GetArguments( return replicated_arguments; } -tensorflow::Status Service::ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) { - VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString(); - - std::vector>> all_arguments; - std::vector> 
all_executors; - std::vector versioned_handles; - std::vector> module_configs; - std::vector computation_names; - std::vector device_handles; - - int num_requested_devices = - std::accumulate(arg->requests().begin(), arg->requests().end(), 0, - [](int a, const ExecuteRequest& r) -> int { - return a + r.execution_options().device_handles_size(); - }); - if (num_requested_devices * options_.number_of_replicas() > - execute_backend_->device_count()) { - return FailedPrecondition( - "there are not enough stream executors to execute %d computations", - num_requested_devices); - } - - for (int64 i = 0; i < arg->requests_size(); ++i) { - // Get the stream executor for the i'th computation. This stream executor - // is one of the executors to run the replicated computation. - const ExecutionOptions& execution_options = - arg->requests(i).execution_options(); - - // Get the executors. - TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options, - arg->requests_size(), i)); - - // Resolve the UserComputation object associated with the requested - // computation and compute the program shape. - const ExecuteRequest& request = arg->requests(i); - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(request.computation())); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - // Get the replicated arguments. - TF_ASSIGN_OR_RETURN(auto replicated_arguments, - GetArguments(execution_options, request.arguments())); - - // Create an HloModuleConfig object for the computation, given the shape of - // the program and the argument allocations. Here, we care only about the - // shapes of the arguments, so, it is sufficient to use the arguments of - // replica 0. - TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(*program_shape, replicated_arguments.front(), - request.execution_options(), user_computation)); - VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: " - << module_config->entry_computation_layout().ToString(); - - // Adds to the vectors to build and execute the computations after the loop. - all_arguments.push_back(replicated_arguments); - all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}}); - versioned_handles.push_back(versioned_handle); - module_configs.push_back(std::move(module_config)); - computation_names.insert(computation_names.end(), executors.size(), - user_computation->name()); - all_executors.push_back(executors); - device_handles.insert(device_handles.end(), - execution_options.device_handles().begin(), - execution_options.device_handles().end()); - } - - // Build the user computations into HloModules and compile to generate the - // executables. - // - // TODO(jlebar): There's currently no way to pass a device allocator to - // ExecuteParallel, so we have to pass a null device_allocator below. 
- TF_ASSIGN_OR_RETURN( - std::vector> executables, - BuildExecutables(versioned_handles, std::move(module_configs), - execute_backend_.get(), all_executors, - /*device_allocator=*/nullptr)); - std::vector executable_ptrs; - executable_ptrs.reserve(executables.size()); - for (const auto& executable : executables) { - executable_ptrs.push_back(executable.get()); - } - - // Execute the generated executables in parallel and return the device - // handles for each computation's output. - ExecutionProfile profile; - TF_ASSIGN_OR_RETURN( - std::vector outputs, - ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments, - execute_backend_.get(), device_handles, - computation_names, &profile)); - for (const GlobalDataHandle& output : outputs) { - ExecuteResponse response; - *response.mutable_output() = output; - *response.mutable_profile() = profile; - *result->add_responses() = response; - } - - VLOG(1) << "successfully completed 'execute-parallel' request"; - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::ExecuteGraphParallel( - const ExecuteGraphParallelRequest* arg, ExecuteParallelResponse* result) { +Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, + ExecuteParallelResponse* result) { VLOG(1) << "running execute-graph-parallel request"; std::vector>> all_arguments; @@ -926,11 +689,10 @@ tensorflow::Status Service::ExecuteGraphParallel( std::unique_ptr module_config, CreateModuleConfig(request.computation().program_shape(), replicated_arguments.front(), - request.execution_options(), - /*user_computation=*/nullptr)); + request.execution_options())); VLOG(3) << "ExecuteGraphParallel created HloModuleConfig computation layout: " - << module_config->entry_computation_layout().ToString(); + << module_config->host_entry_computation_layout().ToString(); // Adds to the vectors to build and execute the computations after the loop. 
all_arguments.push_back(replicated_arguments); @@ -975,11 +737,11 @@ tensorflow::Status Service::ExecuteGraphParallel( } VLOG(1) << "successfully completed 'execute-graph-parallel' request"; - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, - GetDeviceHandlesResponse* result) { +Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, + GetDeviceHandlesResponse* result) { const int64 available_device_count = execute_backend_->device_count(); const int64 replica_count = options_.number_of_replicas(); if (replica_count <= 0) { @@ -999,20 +761,11 @@ tensorflow::Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg, *result->add_device_handles() = device_handle; } - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::ExecuteOneToN(const ExecuteRequest* arg, - ExecuteResponse* result) { - ExecuteParallelRequest parallel_arg; - *parallel_arg.add_requests() = *arg; - ExecuteParallelResponse parallel_result; - TF_RETURN_IF_ERROR(ExecuteParallel(¶llel_arg, ¶llel_result)); - return PickParallelResponse(parallel_result, result); + return Status::OK(); } -tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg, - ExecuteResponse* result) { +Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg, + ExecuteResponse* result) { ExecuteGraphParallelRequest parallel_arg; *parallel_arg.add_requests() = *arg; ExecuteParallelResponse parallel_result; @@ -1020,7 +773,7 @@ tensorflow::Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg, return PickParallelResponse(parallel_result, result); } -tensorflow::Status Service::PickParallelResponse( +Status Service::PickParallelResponse( const ExecuteParallelResponse& parallel_result, ExecuteResponse* result) { // The "result device" selection is a bit hacky, but better than assuming it // is device 0. We have b/76035356 for restructuring the client API to clean @@ -1043,81 +796,6 @@ tensorflow::Status Service::PickParallelResponse( return Status::OK(); } -tensorflow::Status Service::Execute(const ExecuteRequest* arg, - ExecuteResponse* result) { - VLOG(1) << "running execute request: " << arg->ShortDebugString(); - - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - // If we received multiple device handles, we must partition the module. - if (arg->execution_options().device_handles_size() > 1) { - return ExecuteOneToN(arg, result); - } - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, - SingleComputationDeviceHandle())); - TF_ASSIGN_OR_RETURN( - std::vector> replicated_arguments, - ResolveAndValidateArguments(arg->arguments(), replicas)); - - // Since we care only about the shapes of the arguments, it is sufficient to - // use the arguments of replica 0. 
- TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(*program_shape, replicated_arguments.front(), - arg->execution_options(), user_computation)); - - VLOG(3) << "Execute created HloModuleConfig computation layout: " - << module_config->entry_computation_layout().ToString(); - - TF_ASSIGN_OR_RETURN( - std::shared_ptr executable, - BuildAndCacheExecutable(versioned_handle, std::move(module_config), - execute_backend_.get(), - execute_backend_->default_stream_executor(), - result->mutable_profile())); - - if (executable->dumping()) { - executable->session_module()->set_execution_platform( - execute_backend_->platform()->Name()); - TF_RETURN_IF_ERROR(RecordArguments( - replicated_arguments.front(), - execute_backend_->default_stream_executor(), - execute_backend_->transfer_manager(), executable->session_module())); - } - - TF_ASSIGN_OR_RETURN( - *result->mutable_output(), - ExecuteAndRegisterResult( - executable.get(), replicated_arguments, execute_backend_.get(), - "result of " + user_computation->name(), result->mutable_profile())); - - if (executable->dumping()) { - TF_ASSIGN_OR_RETURN( - const ShapedBuffer* result_buffer, - allocation_tracker_.ResolveForReplica(result->output(), 0)); - TF_RETURN_IF_ERROR(RecordResult( - *result_buffer, execute_backend_->default_stream_executor(), - execute_backend_->transfer_manager(), executable->session_module())); - TF_RETURN_IF_ERROR(executable->DumpSessionModule()); - } - - VLOG(1) << "successfully completed 'execute' request"; - return tensorflow::Status::OK(); -} - StatusOr> Service::BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, @@ -1126,6 +804,22 @@ StatusOr> Service::BuildExecutable( "BuildExecutable on service %p with serialized module proto: %s", this, module_proto.name().c_str()); + // Dump computation proto state if flag is set. + auto hlo_snapshot = MakeUnique(); + const string& directory_path = + module_config->debug_options().xla_dump_computations_to(); + const string& execution_directory_path = + module_config->debug_options().xla_dump_executions_to(); + if (!directory_path.empty() || !execution_directory_path.empty()) { + *hlo_snapshot->mutable_hlo()->mutable_hlo_module() = module_proto; + if (!directory_path.empty()) { + string filename = Printf("computation_%lld__%s", module_proto.id(), + module_proto.entry_computation_name().c_str()); + TF_RETURN_IF_ERROR( + Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot)); + } + } + TF_ASSIGN_OR_RETURN(std::unique_ptr module, HloModule::CreateFromProto(module_proto, *module_config)); @@ -1134,6 +828,8 @@ StatusOr> Service::BuildExecutable( TF_ASSIGN_OR_RETURN( module, backend->compiler()->RunHloPasses(std::move(module), executor, device_allocator)); + // Check that on-host and on-device shapes are consistent. 
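  // (Specifically, ValidateEntryComputationLayout, defined earlier in this
  // file, requires each device-side entry parameter shape and the result shape
  // to equal TransferManager::HostShapeToDeviceShape() applied to the
  // corresponding host-side shape.)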
+ TF_RETURN_IF_ERROR(ValidateEntryComputationLayout(module.get())); TF_ASSIGN_OR_RETURN(std::unique_ptr executable, backend->compiler()->RunBackend( @@ -1142,8 +838,8 @@ StatusOr> Service::BuildExecutable( return std::move(executable); } -tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, - ExecuteResponse* result) { +Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, + ExecuteResponse* result) { VLOG(1) << "running execute-graph request"; if (!arg->has_computation()) { @@ -1176,99 +872,37 @@ tensorflow::Status Service::ExecuteGraph(const ExecuteGraphRequest* arg, execute_backend_->default_stream_executor(), /*device_allocator=*/nullptr)); + if (executable->dumping_snapshot()) { + executable->hlo_snapshot()->set_execution_platform( + execute_backend_->platform()->Name()); + TF_RETURN_IF_ERROR(RecordArguments( + replicated_arguments.front(), + execute_backend_->default_stream_executor(), + execute_backend_->transfer_manager(), executable->hlo_snapshot())); + } + TF_ASSIGN_OR_RETURN( *result->mutable_output(), ExecuteAndRegisterResult( executable.get(), replicated_arguments, execute_backend_.get(), "result of " + arg->computation().name(), result->mutable_profile())); - VLOG(1) << "successfully completed 'execute-graph' request"; - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) { - VLOG(1) << "running execute-async request: " << arg->ShortDebugString(); - - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - std::shared_ptr program_shape, - user_computation->ComputeProgramShape(versioned_handle.version)); - - TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_, - SingleComputationDeviceHandle())); - TF_RET_CHECK(!replicas.empty()); - TF_ASSIGN_OR_RETURN( - std::vector> replicated_arguments, - ResolveAndValidateArguments(arg->arguments(), replicas)); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr module_config, - CreateModuleConfig(*program_shape, replicated_arguments.front(), - arg->execution_options(), user_computation)); - - VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: " - << module_config->entry_computation_layout().ToString(); - - ExecutionProfile profile; - - TF_ASSIGN_OR_RETURN( - std::shared_ptr executable, - BuildAndCacheExecutable( - versioned_handle, std::move(module_config), execute_backend_.get(), - execute_backend_->default_stream_executor(), &profile)); - - // Set up streams. 
- std::vector::SmartPtr> streams; - for (se::StreamExecutor* executor : replicas) { - TF_ASSIGN_OR_RETURN(Pool::SmartPtr stream, - execute_backend_->BorrowStream(executor)); - streams.push_back(std::move(stream)); - } - - std::vector result_buffers; - for (size_t i = 0; i < streams.size(); ++i) { - const auto& stream = streams[i]; - ExecutableRunOptions options; - options.set_stream(stream.get()); - options.set_allocator(execute_backend_->memory_allocator()); - options.set_inter_op_thread_pool(execute_backend_->inter_op_thread_pool()); - options.set_intra_op_thread_pool( - execute_backend_->eigen_intra_op_thread_pool_device()); - - ServiceExecutableRunOptions service_options( - options, execute_backend_->StreamBorrower()); - - TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer, - executable->ExecuteAsyncOnStream( - &service_options, replicated_arguments[i])); - - result_buffers.emplace_back(std::move(this_result_buffer)); + if (executable->dumping_snapshot()) { + TF_ASSIGN_OR_RETURN( + const ShapedBuffer* result_buffer, + allocation_tracker_.ResolveForReplica(result->output(), 0)); + TF_RETURN_IF_ERROR(RecordResult( + *result_buffer, execute_backend_->default_stream_executor(), + execute_backend_->transfer_manager(), executable->hlo_snapshot())); + TF_RETURN_IF_ERROR(executable->DumpHloSnapshot()); } - TF_ASSIGN_OR_RETURN( - GlobalDataHandle output, - allocation_tracker_.RegisterReplicatedBuffers( - std::move(result_buffers), "result of " + user_computation->name())); - - *result->mutable_execution() = execution_tracker_.Register( - execute_backend_.get(), std::move(streams), profile, output); - streams.clear(); - - VLOG(1) << "successfully completed 'execute-async' request"; - return tensorflow::Status::OK(); + VLOG(1) << "successfully completed 'execute-graph' request"; + return Status::OK(); } -tensorflow::Status Service::WaitForExecution(const WaitForExecutionRequest* arg, - WaitForExecutionResponse* result) { +Status Service::WaitForExecution(const WaitForExecutionRequest* arg, + WaitForExecutionResponse* result) { TF_ASSIGN_OR_RETURN(const auto execution, execution_tracker_.Resolve(arg->execution())); @@ -1279,11 +913,11 @@ tensorflow::Status Service::WaitForExecution(const WaitForExecutionRequest* arg, TF_RETURN_IF_ERROR(execution_tracker_.Unregister(arg->execution())); VLOG(1) << "successfully completed 'wait-for-execution' request"; - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg, - TransferToClientResponse* result) { +Status Service::TransferToClient(const TransferToClientRequest* arg, + TransferToClientResponse* result) { TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer, allocation_tracker_.ResolveForReplica(arg->data(), 0)); @@ -1313,7 +947,7 @@ tensorflow::Status Service::TransferToClient(const TransferToClientRequest* arg, *result->mutable_literal() = result_literal->Relayout(*return_shape)->ToProto(); } - return tensorflow::Status::OK(); + return Status::OK(); } namespace { @@ -1331,8 +965,8 @@ std::unique_ptr CloneShapedBufferOnDevice( } // namespace -tensorflow::Status Service::TransferToServer(const TransferToServerRequest* arg, - TransferToServerResponse* result) { +Status Service::TransferToServer(const TransferToServerRequest* arg, + TransferToServerResponse* result) { TF_ASSIGN_OR_RETURN(std::unique_ptr literal, Literal::CreateFromProto(arg->literal())); const Shape& shape = literal->shape(); @@ -1365,11 +999,11 @@ tensorflow::Status Service::TransferToServer(const 
TransferToServerRequest* arg, StrCat("TransferToServer literal of shape ", ShapeUtil::HumanString(shape)))); - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg, - TransferToInfeedResponse* result) { +Status Service::TransferToInfeed(const TransferToInfeedRequest* arg, + TransferToInfeedResponse* result) { const int64 replica_count = options_.number_of_replicas(); if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) { return FailedPrecondition( @@ -1398,9 +1032,8 @@ tensorflow::Status Service::TransferToInfeed(const TransferToInfeedRequest* arg, executor, *literal); } -tensorflow::Status Service::TransferFromOutfeed( - const TransferFromOutfeedRequest* arg, - TransferFromOutfeedResponse* result) { +Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg, + TransferFromOutfeedResponse* result) { const int64 replica_count = options_.number_of_replicas(); if (arg->replica_id() < 0 || arg->replica_id() >= replica_count) { return FailedPrecondition( @@ -1426,127 +1059,16 @@ tensorflow::Status Service::TransferFromOutfeed( execute_backend_->transfer_manager()->TransferLiteralFromOutfeed( executor, arg->shape_with_layout(), &literal)); *result->mutable_literal() = literal.ToProto(); - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::ResetDevice(const ResetDeviceRequest* arg, - ResetDeviceResponse* result) { +Status Service::ResetDevice(const ResetDeviceRequest* arg, + ResetDeviceResponse* result) { return execute_backend_->ResetDevices(); } -tensorflow::Status Service::IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandleAtOperation(arg->operand()); - - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - bool is_constant, - user_computation->IsConstant(arg->operand(), arg->num_parameters())); - - result->set_is_constant(is_constant); - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::ComputeConstant(const ComputeConstantRequest* arg, - ComputeConstantResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandleAtOperation(arg->operand()); - - if (user_computation->request_count(versioned_handle.version) == 0) { - return InvalidArgument("computations may not be empty"); - } - - TF_ASSIGN_OR_RETURN( - bool is_constant, - user_computation->IsConstant(arg->operand(), arg->parameters_size())); - if (!is_constant) { - StatusOr op_request_status = - user_computation->LookUpRequestForErrorReporting(arg->operand()); - string op_request_string = ""; - if (op_request_status.ok()) { - op_request_string = op_request_status.ValueOrDie()->ShortDebugString(); - } - return InvalidArgument( - "Operand to ComputeConstant depends on a parameter.\n\n" - " op requested for constant evaluation: %s\n\n" - "This is an internal error that typically happens when the XLA user " - "(e.g. TensorFlow) is attempting to determine a value that must be a " - "compile-time constant (e.g. 
an array dimension) but it is not capable " - "of being evaluated at XLA compile time.\n\n" - "Please file a usability bug with the framework being used (e.g. " - "TensorFlow).", - op_request_string.c_str()); - } - - // We can't use ComputeProgramShape because it checks that all parameter - // instructions are present and contiguous. Instead construct ProgramShape - // directly. - ProgramShape program_shape; - TF_ASSIGN_OR_RETURN(*program_shape.mutable_result(), - user_computation->GetShape(arg->operand())); - - TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result())); - - ExecutionOptions execution_options = xla::CreateDefaultExecutionOptions(); - execution_options.mutable_debug_options()->set_xla_enable_fast_math(false); - execution_options.mutable_debug_options() - ->set_xla_eliminate_hlo_implicit_broadcast(true); - *execution_options.mutable_shape_with_output_layout() = - program_shape.result(); - - Shape shape_with_output_layout(program_shape.result()); - if (arg->has_output_layout()) { - TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape( - arg->output_layout(), execution_options.shape_with_output_layout())); - *execution_options.mutable_shape_with_output_layout()->mutable_layout() = - arg->output_layout(); - } - - TF_ASSIGN_OR_RETURN(std::unique_ptr module_config, - CreateModuleConfig(program_shape, {}, execution_options, - user_computation)); - - // Exclude dead parameter instructions for the purpose of computing constants. - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.BuildHloModule(versioned_handle, *module_config, - /*include_unreachable_instructions=*/ - false)); - - std::vector> parameters(arg->parameters_size()); - for (int64 i = 0; i < arg->parameters_size(); ++i) { - TF_ASSIGN_OR_RETURN(parameters[i], - Literal::CreateFromProto(arg->parameters(i))); - } - HloEvaluator evaluator; - TF_ASSIGN_OR_RETURN( - auto result_literal, - evaluator.Evaluate>(*module, parameters)); - - // Since the shape_with_output_layout option in ExecutionOption is - // non-effective to the Evaluator results, explicit relayout here. - // - // TODO(b/77824332): Make HloEvaluator take care of the re-layout. 
- if (arg->has_output_layout()) { - result_literal = result_literal->Relayout(arg->output_layout()); - } - *result->mutable_literal() = result_literal->ToProto(); - - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::ComputeConstantGraph( - const ComputeConstantGraphRequest* arg, ComputeConstantResponse* result) { +Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg, + ComputeConstantResponse* result) { if (!arg->has_computation()) { return InvalidArgument("computations may not be empty"); } @@ -1584,73 +1106,17 @@ tensorflow::Status Service::ComputeConstantGraph( } *result->mutable_literal() = result_literal->ToProto(); - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::GetShape(const GetShapeRequest* arg, - GetShapeResponse* result) { +Status Service::GetShape(const GetShapeRequest* arg, GetShapeResponse* result) { TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer, allocation_tracker_.ResolveForReplica(arg->data(), 0)); *result->mutable_shape() = buffer->on_host_shape(); - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::GetComputationShape( - const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - computation->GetVersionedHandle(); - - TF_ASSIGN_OR_RETURN(auto program_shape, computation->ComputeProgramShape( - versioned_handle.version)); - *result->mutable_program_shape() = *program_shape; - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - - TF_ASSIGN_OR_RETURN(*result->mutable_shape(), - computation->GetShape(arg->operand())); - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::GetComputationStats( - const ComputationStatsRequest* arg, ComputationStatsResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * user_computation, - computation_tracker_.Resolve(arg->computation())); - - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); - - HloModuleConfig config; - config.set_debug_options(arg->debug_options()); - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.BuildHloModule(versioned_handle, config)); - - hlo_graph_dumper::MaybeDumpHloModule(*module, - "computation statistics subject"); - - // Run HLO analysis to get the computation statistics. 
- HloCostAnalysis analysis( - execute_backend_->compiler()->ShapeSizeBytesFunction()); - - TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&analysis)); - - ComputationStats stats; - stats.set_flop_count(analysis.flop_count()); - stats.set_transcendental_count(analysis.transcendental_count()); - *result->mutable_stats() = stats; - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status Service::GetComputationGraphStats( +Status Service::GetComputationGraphStats( const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) { if (!arg->has_computation()) { return InvalidArgument("Computations may not be empty."); @@ -1677,264 +1143,7 @@ tensorflow::Status Service::GetComputationGraphStats( stats.set_flop_count(analysis.flop_count()); stats.set_transcendental_count(analysis.transcendental_count()); *result->mutable_stats() = stats; - return tensorflow::Status::OK(); -} - -template -tensorflow::Status Service::AddInstruction( - const RequestT* arg, ResponseT* result, - const std::function(UserComputation*)>& - adder) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - - TF_ASSIGN_OR_RETURN(*result->mutable_output(), adder(computation)); - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::Op(const OpRequest* arg, OpResponse* result) { - TF_ASSIGN_OR_RETURN(UserComputation * computation, - computation_tracker_.Resolve(arg->computation())); - StatusOr handle_status; - - switch (arg->op_case()) { - case OpRequest::kBatchNormTrainingRequest: - handle_status = computation->AddBatchNormTrainingInstruction( - arg->batch_norm_training_request()); - break; - case OpRequest::kBatchNormInferenceRequest: - handle_status = computation->AddBatchNormInferenceInstruction( - arg->batch_norm_inference_request()); - break; - case OpRequest::kBatchNormGradRequest: - handle_status = computation->AddBatchNormGradInstruction( - arg->batch_norm_grad_request()); - break; - case OpRequest::kBinaryOpRequest: - handle_status = - computation->AddBinaryInstruction(arg->binary_op_request()); - break; - case OpRequest::kBroadcastRequest: - handle_status = - computation->AddBroadcastInstruction(arg->broadcast_request()); - break; - case OpRequest::kCallRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * to_apply, - computation_tracker_.Resolve(arg->call_request().to_apply())); - handle_status = - computation->AddCallInstruction(arg->call_request(), *to_apply); - break; - } - case OpRequest::kConcatenateRequest: - handle_status = - computation->AddConcatenateInstruction(arg->concatenate_request()); - break; - case OpRequest::kConditionalRequest: { - TF_ASSIGN_OR_RETURN(UserComputation * true_computation, - computation_tracker_.Resolve( - arg->conditional_request().true_computation())); - TF_ASSIGN_OR_RETURN(UserComputation * false_computation, - computation_tracker_.Resolve( - arg->conditional_request().false_computation())); - handle_status = computation->AddConditionalInstruction( - arg->conditional_request(), *true_computation, *false_computation); - break; - } - case OpRequest::kConstantRequest: - handle_status = - computation->AddConstantInstruction(arg->constant_request()); - break; - case OpRequest::kConvertRequest: - handle_status = - computation->AddConvertInstruction(arg->convert_request()); - break; - case OpRequest::kBitcastConvertRequest: - handle_status = computation->AddBitcastConvertInstruction( - arg->bitcast_convert_request()); - break; - case OpRequest::kConvolveRequest: - handle_status = - 
computation->AddConvolveInstruction(arg->convolve_request()); - break; - case OpRequest::kCrossReplicaSumRequest: - handle_status = computation->AddCrossReplicaSumInstruction( - arg->cross_replica_sum_request()); - break; - case OpRequest::kCustomCallRequest: - handle_status = - computation->AddCustomCallInstruction(arg->custom_call_request()); - break; - case OpRequest::kDotRequest: - handle_status = computation->AddDotInstruction(arg->dot_request()); - break; - case OpRequest::kDynamicSliceRequest: - handle_status = - computation->AddDynamicSliceInstruction(arg->dynamic_slice_request()); - break; - case OpRequest::kDynamicUpdateSliceRequest: - handle_status = computation->AddDynamicUpdateSliceInstruction( - arg->dynamic_update_slice_request()); - break; - case OpRequest::kFftRequest: - handle_status = computation->AddFftInstruction(arg->fft_request()); - break; - case OpRequest::kGatherRequest: - handle_status = computation->AddGatherInstruction(arg->gather_request()); - break; - case OpRequest::kGetTupleElementRequest: - handle_status = computation->AddGetTupleElementInstruction( - arg->get_tuple_element_request()); - break; - case OpRequest::kInfeedRequest: - handle_status = computation->AddInfeedInstruction(arg->infeed_request()); - break; - case OpRequest::kOutfeedRequest: - handle_status = - computation->AddOutfeedInstruction(arg->outfeed_request()); - break; - case OpRequest::kHostComputeRequest: - handle_status = - computation->AddHostComputeInstruction(arg->host_compute_request()); - break; - case OpRequest::kMapRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * to_apply, - computation_tracker_.Resolve(arg->map_request().to_apply())); - handle_status = - computation->AddMapInstruction(arg->map_request(), *to_apply); - break; - } - case OpRequest::kPadRequest: - handle_status = computation->AddPadInstruction(arg->pad_request()); - break; - case OpRequest::kParameterRequest: - handle_status = - computation->AddParameterInstruction(arg->parameter_request()); - break; - case OpRequest::kReduceRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * to_apply, - computation_tracker_.Resolve(arg->reduce_request().to_apply())); - handle_status = - computation->AddReduceInstruction(arg->reduce_request(), *to_apply); - break; - } - case OpRequest::kReducePrecisionRequest: { - handle_status = computation->AddReducePrecisionInstruction( - arg->reduce_precision_request()); - break; - } - case OpRequest::kReduceWindowRequest: { - TF_ASSIGN_OR_RETURN(UserComputation * to_apply, - computation_tracker_.Resolve( - arg->reduce_window_request().to_apply())); - handle_status = computation->AddReduceWindowInstruction( - arg->reduce_window_request(), *to_apply); - break; - } - case OpRequest::kReshapeRequest: - handle_status = - computation->AddReshapeInstruction(arg->reshape_request()); - break; - case OpRequest::kReverseRequest: - handle_status = - computation->AddReverseInstruction(arg->reverse_request()); - break; - case OpRequest::kRngRequest: - handle_status = computation->AddRngInstruction(arg->rng_request()); - break; - case OpRequest::kSelectAndScatterRequest: { - TF_ASSIGN_OR_RETURN(UserComputation * select, - computation_tracker_.Resolve( - arg->select_and_scatter_request().select())); - TF_ASSIGN_OR_RETURN(UserComputation * scatter, - computation_tracker_.Resolve( - arg->select_and_scatter_request().scatter())); - handle_status = computation->AddSelectAndScatterInstruction( - arg->select_and_scatter_request(), *select, *scatter); - break; - } - case OpRequest::kSliceRequest: - handle_status 
= computation->AddSliceInstruction(arg->slice_request()); - break; - case OpRequest::kTernaryOpRequest: - handle_status = - computation->AddTernaryInstruction(arg->ternary_op_request()); - break; - case OpRequest::kTraceRequest: - return computation->AddTraceInstruction(arg->trace_request()); - case OpRequest::kTransposeRequest: - handle_status = - computation->AddTransposeInstruction(arg->transpose_request()); - break; - case OpRequest::kUnaryOpRequest: - handle_status = computation->AddUnaryInstruction(arg->unary_op_request()); - break; - case OpRequest::kVariadicOpRequest: - handle_status = - computation->AddVariadicInstruction(arg->variadic_op_request()); - break; - case OpRequest::kWhileRequest: { - TF_ASSIGN_OR_RETURN( - UserComputation * condition, - computation_tracker_.Resolve(arg->while_request().condition())); - TF_ASSIGN_OR_RETURN( - UserComputation * body, - computation_tracker_.Resolve(arg->while_request().body())); - handle_status = computation->AddWhileInstruction(arg->while_request(), - *condition, *body); - break; - } - case OpRequest::kSendRequest: { - TF_RETURN_IF_ERROR( - channel_tracker_.RegisterSend(arg->send_request().channel_handle())); - // Send does not return a value, but we need a handle to be able to - // set OpMetadata and OpSharding (device assignment). - handle_status = computation->AddSendInstruction(arg->send_request()); - break; - } - case OpRequest::kRecvRequest: { - TF_RETURN_IF_ERROR( - channel_tracker_.RegisterRecv(arg->recv_request().channel_handle())); - handle_status = computation->AddRecvInstruction(arg->recv_request()); - break; - } - case OpRequest::OP_NOT_SET: - return InvalidArgument("XLA service received OpRequest with OP_NOT_SET"); - default: - return InvalidArgument("Unsupported operation in XLA service"); - } - TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle_status); - - // We set the debug metadata here, because we slice off part of the OpRequest - // proto in the above switch statement. - TF_ASSIGN_OR_RETURN(ComputationDataHandle handle, handle_status); - TF_RETURN_IF_ERROR(computation->SetOpMetadata(handle, arg->metadata())); - if (arg->has_sharding()) { - TF_RETURN_IF_ERROR(computation->SetOpSharding(handle, arg->sharding())); - } - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::SnapshotComputation( - const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) { - TF_ASSIGN_OR_RETURN( - std::unique_ptr module, - computation_tracker_.SnapshotComputation(arg->computation())); - - result->set_allocated_module(module.release()); - - return tensorflow::Status::OK(); -} - -tensorflow::Status Service::LoadComputationSnapshot( - const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) { - TF_ASSIGN_OR_RETURN(*result->mutable_computation(), - computation_tracker_.LoadSessionModule(arg->module())); - return tensorflow::Status::OK(); + return Status::OK(); } DeviceHandle Service::SingleComputationDeviceHandle() const { diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h index 476bd0597de735..d64b2b4d0afa15 100644 --- a/tensorflow/compiler/xla/service/service.h +++ b/tensorflow/compiler/xla/service/service.h @@ -27,15 +27,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/channel_tracker.h" #include "tensorflow/compiler/xla/service/compilation_cache.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" #include "tensorflow/compiler/xla/service/device_memory_allocator.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/execution_tracker.h" #include "tensorflow/compiler/xla/service/hlo_execution_profile.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_module_config.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/user_computation.h" #include "tensorflow/compiler/xla/service/versioned_computation_handle.h" #include "tensorflow/compiler/xla/service_interface.h" #include "tensorflow/compiler/xla/statusor.h" @@ -83,57 +80,29 @@ class Service : public ServiceInterface { static StatusOr> NewService( const ServiceOptions& options); - // Creates a new computation with the given name. - // A unique ComputationHandle is returned. - tensorflow::Status Computation(const ComputationRequest* arg, - ComputationResponse* result) override; - // Unregisters a previously-allocated global handle. // // If the handle given is not currently allocated, a NOT_FOUND status is // returned. - tensorflow::Status Unregister(const UnregisterRequest* arg, - UnregisterResponse* result) override; + Status Unregister(const UnregisterRequest* arg, + UnregisterResponse* result) override; // Deconstructs a tuple. Returns a newly created GlobalDataHandle for each // element in the tuple. - tensorflow::Status DeconstructTuple( - const DeconstructTupleRequest* arg, - DeconstructTupleResponse* result) override; - - // Modifies the provided computation so that subsequent executions - // will compute the provided ComputationDataHandle, rather than the - // last expression enqueued on that Computation. - tensorflow::Status SetReturnValue(const SetReturnValueRequest* arg, - SetReturnValueResponse* results) override; - - // Executes a computation with the provided global data passed as - // immutable arguments. Returns global data output and execution timing. - tensorflow::Status Execute(const ExecuteRequest* arg, - ExecuteResponse* result) override; + Status DeconstructTuple(const DeconstructTupleRequest* arg, + DeconstructTupleResponse* result) override; // Executes a computation with the provided global data passed as // immutable arguments. The request contains the whole computation graph. // Returns global data output and execution timing. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. - tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* arg, - ExecuteResponse* result) override; + Status ExecuteGraph(const ExecuteGraphRequest* arg, + ExecuteResponse* result) override; // Executes one or more computations in parallel with the provided global data // passed as immutable arguments. Returns global data output for each // computation. - tensorflow::Status ExecuteParallel(const ExecuteParallelRequest* arg, - ExecuteParallelResponse* result) override; - - // Executes one or more computations in parallel with the provided global data - // passed as immutable arguments. Returns global data output for each - // computation. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. 
- tensorflow::Status ExecuteGraphParallel( - const ExecuteGraphParallelRequest* arg, - ExecuteParallelResponse* result) override; + Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, + ExecuteParallelResponse* result) override; // Requests one or more device handles from the target. // @@ -143,49 +112,33 @@ class Service : public ServiceInterface { // the first set of replicas, and the next R devices to the second set of // replicas, etc. Each returned device handle represents the device with the // replica id 0. - tensorflow::Status GetDeviceHandles( - const GetDeviceHandlesRequest* arg, - GetDeviceHandlesResponse* result) override; - - // Asynchronously executes a computation with provided arguments. Invokes - // the provided computation with the provided global data passed as - // immutable arguments. Returns a handle to the execution. - // - // (Note: The corresponding function in xla::Client was removed as part of - // b/64116060, in an attempt to simplify our API. We're keeping this around - // for now in case we want to expose this to clients in a different way.) - tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) override; + Status GetDeviceHandles(const GetDeviceHandlesRequest* arg, + GetDeviceHandlesResponse* result) override; // Waits until the specified execution is complete and returns the result. // Calling this API multiple times with the same execution handle returns the // method with an error since the execution handle is destroyed after the // first call. - tensorflow::Status WaitForExecution( - const WaitForExecutionRequest* arg, - WaitForExecutionResponse* result) override; + Status WaitForExecution(const WaitForExecutionRequest* arg, + WaitForExecutionResponse* result) override; // Requests that global data be transferred to the client in literal form. - tensorflow::Status TransferToClient( - const TransferToClientRequest* arg, - TransferToClientResponse* result) override; + Status TransferToClient(const TransferToClientRequest* arg, + TransferToClientResponse* result) override; // Transfers data from a literal provided by the client, into device memory. - tensorflow::Status TransferToServer( - const TransferToServerRequest* arg, - TransferToServerResponse* result) override; + Status TransferToServer(const TransferToServerRequest* arg, + TransferToServerResponse* result) override; // Transfers data from a literal provided by the client, into the Infeed // buffer of the device. - tensorflow::Status TransferToInfeed( - const TransferToInfeedRequest* arg, - TransferToInfeedResponse* result) override; + Status TransferToInfeed(const TransferToInfeedRequest* arg, + TransferToInfeedResponse* result) override; // Transfers data from the Outfeed othe device to the literal provided by the // client. - tensorflow::Status TransferFromOutfeed( - const TransferFromOutfeedRequest* arg, - TransferFromOutfeedResponse* result) override; + Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg, + TransferFromOutfeedResponse* result) override; // Resets devices, clearing all existing state on all the devices associated // with this service (including memory allocated on the devices). @@ -196,77 +149,25 @@ class Service : public ServiceInterface { // ResetDevice should be called before an Execution that expect the device to // be in the reset state. For example, if the prior Execution modifies device // state (e.g., architectural state) that the next Execution depends on. 
- tensorflow::Status ResetDevice(const ResetDeviceRequest* arg, - ResetDeviceResponse* result) override; - - // Tests if an expression is a compile-time constant. - tensorflow::Status IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) override; + Status ResetDevice(const ResetDeviceRequest* arg, + ResetDeviceResponse* result) override; - // Computes the value of a constant expression. - tensorflow::Status ComputeConstant(const ComputeConstantRequest* arg, - ComputeConstantResponse* result) override; - tensorflow::Status ComputeConstantGraph( - const ComputeConstantGraphRequest* arg, - ComputeConstantResponse* result) override; + Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg, + ComputeConstantResponse* result) override; // Returns the shape (with layout) of an array associated with a given data // handle. - tensorflow::Status GetShape(const GetShapeRequest* arg, - GetShapeResponse* result) override; - - // Returns the program shape of the computation associated with the given - // handle. - tensorflow::Status GetComputationShape( - const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) override; - - ///// - // Computation-oriented methods. - - // Enqueues an Op on the computation. - tensorflow::Status Op(const OpRequest* arg, OpResponse* result) override; - - // Retrieves the inferred shape for a value within a computation. - tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) override; + Status GetShape(const GetShapeRequest* arg, + GetShapeResponse* result) override; // Retrieves the statistics of a computation. - tensorflow::Status GetComputationStats( - const ComputationStatsRequest* arg, - ComputationStatsResponse* result) override; - - // Retrieves the statistics of a computation. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. - tensorflow::Status GetComputationGraphStats( - const ComputationGraphStatsRequest* arg, - ComputationStatsResponse* result) override; - - // Snapshots the current state of a computation handle into a serializable - // protocol buffer form, so it can be loaded via - // LoadComputationSnapshot. - tensorflow::Status SnapshotComputation( - const SnapshotComputationRequest* arg, - SnapshotComputationResponse* result) override; - - // Loads a computation from a serialized protocol buffer created via - // SnapshotComputation. - tensorflow::Status LoadComputationSnapshot( - const LoadComputationSnapshotRequest* arg, - LoadComputationSnapshotResponse* result) override; + Status GetComputationGraphStats(const ComputationGraphStatsRequest* arg, + ComputationStatsResponse* result) override; // Creates a unique channel handle that can be used for Send/Recv // instructions. - tensorflow::Status CreateChannelHandle( - const CreateChannelHandleRequest* arg, - CreateChannelHandleResponse* result) override; - - // Returns the ComputationTracker of the current service instance. - // Only used in unit tests to access user computations from client. - const ComputationTracker& computation_tracker() { - return computation_tracker_; - } + Status CreateChannelHandle(const CreateChannelHandleRequest* arg, + CreateChannelHandleResponse* result) override; // Returns the backend used to execute computations. 
const Backend& backend() const { return *execute_backend_; } @@ -278,8 +179,7 @@ class Service : public ServiceInterface { StatusOr> CreateModuleConfig( const ProgramShape& program_shape, tensorflow::gtl::ArraySlice arguments, - const ExecutionOptions& execution_options, - const UserComputation* user_computation = nullptr); + const ExecutionOptions& execution_options); // Picks a parallel response and fills the result. Status PickParallelResponse(const ExecuteParallelResponse& parallel_result, @@ -295,6 +195,9 @@ class Service : public ServiceInterface { const ExecutionOptions& execution_options, tensorflow::gtl::ArraySlice arguments); + // Assert that host- and device-shapes are in a consistent state. + Status ValidateEntryComputationLayout(HloModule* module); + protected: friend class LocalExecutable; @@ -317,23 +220,13 @@ class Service : public ServiceInterface { StatusOr> CreateModuleConfig( const ProgramShape& program_shape, tensorflow::gtl::ArraySlice argument_shapes, - const ExecutionOptions* execution_options, - const UserComputation* user_computation = nullptr); + const ExecutionOptions* execution_options); // Builds an Executable for the given parameters. // // If device_allocator is not null, the compiler may use it to allocate temp // buffers, which the compiler is responsible for freeing. The allocator // given here need not match the allocator used when running the executable. - StatusOr> BuildExecutable( - const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, - DeviceMemoryAllocator* device_allocator = nullptr); - - // Builds an Executable for the given HLO module proto. - // - // TODO(b/74197823): This is a part of a NOT YET ready refactor. StatusOr> BuildExecutable( const HloModuleProto& module_proto, std::unique_ptr module_config, Backend* backend, @@ -342,26 +235,12 @@ class Service : public ServiceInterface { // Same as BuildExecutable() above, but builds a list of Executables for the // given computations that may interact with each other. - StatusOr>> BuildExecutables( - std::vector versioned_handles, - std::vector> module_configs, - Backend* backend, std::vector> executors, - DeviceMemoryAllocator* device_allocator); StatusOr>> BuildExecutables( const std::vector& module_protos, std::vector> module_configs, Backend* backend, std::vector> executors, DeviceMemoryAllocator* device_allocator); - // Similar to BuildExecutable, but look in the compilation cache for the - // executable first. If the executable is not in the cache, it is built and - // inserted into the cache. - StatusOr> BuildAndCacheExecutable( - const VersionedComputationHandle& versioned_handle, - std::unique_ptr module_config, Backend* backend, - se::StreamExecutor* executor, ExecutionProfile* profile, - DeviceMemoryAllocator* device_allocator = nullptr); - // Runs the given executable with the given arguments and register the result // in the allocation tracker. The handle of the result from the tracker is // returned. If the parameter "profile" is not null, it points to an @@ -384,26 +263,16 @@ class Service : public ServiceInterface { tensorflow::gtl::ArraySlice result_tags, ExecutionProfile* profile); - // Convenience function for adding a function to a user computation. - template - tensorflow::Status AddInstruction( - const RequestT* arg, ResponseT* result, - const std::function(UserComputation*)>& - adder); - // Executes a single computation which has more than one target device. 
// The N devices are expected to all return an empty tuple, but one, which // will be the result of this computation. - tensorflow::Status ExecuteOneToN(const ExecuteRequest* arg, - ExecuteResponse* result); - tensorflow::Status ExecuteOneToN(const ExecuteGraphRequest* arg, - ExecuteResponse* result); + Status ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result); // Convenience function which checks whether the given shape_with_layout // (presumably passed by the client to set the result layout) is valid for the // given computation result shape. - tensorflow::Status ValidateResultShapeWithLayout( - const Shape& shape_with_layout, const Shape& result_shape) const; + Status ValidateResultShapeWithLayout(const Shape& shape_with_layout, + const Shape& result_shape) const; // Returns the stream executors assigned to the replicas represented by the // given device handle. Each device_handle is a virtual replicated device that @@ -419,9 +288,6 @@ class Service : public ServiceInterface { ServiceOptions options_; - // Tracks computations built via the API. - ComputationTracker computation_tracker_; - // Tracks channels created via the API. ChannelTracker channel_tracker_; diff --git a/tensorflow/compiler/xla/service/session.proto b/tensorflow/compiler/xla/service/session.proto deleted file mode 100644 index bb8d1cd2a106ea..00000000000000 --- a/tensorflow/compiler/xla/service/session.proto +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -// This proto file defines messages which store the state of XLA -// computations within the XLA service. A computation is stored as a record -// of the operation requests used to build it. -syntax = "proto3"; - -import "tensorflow/compiler/xla/xla_data.proto"; - -package xla; - -// Describes a single operation request. -message OperationRequest { - ComputationDataHandle output_handle = 1; - Shape output_shape = 2; - - // For operations which call embedded computations such as "Map", these are - // the version(s) that the embedded computation should be called at. A version - // value of a computation is the ComputationDataHandle of the root of the - // computation at the point in time. - // - // "Call", "Map", "Reduce", and "ReduceWindow" operations take a single - // embedded computation so this field will have a single value for those - // operations. - // - // "While" operation takes two; index 0 is the "condition" version and index 1 - // is the "body" version. - repeated int64 embedded_computation_versions = 3; - - // The actual request, which in itself is a tagged union of all possible - // operation request types. - OpRequest request = 4; -} - -// Describes a sequence of operation requests which define an XLA -// computation. -message SessionComputation { - string name = 1; - - // The ComputationHandle used to refer to this computation in the XLA - // service. 
- ComputationHandle computation_handle = 2; - - // Map from ComputationDataHandle value to operation request. The highest - // ComputationDataHandle value corresponds to the root of the computation. - map requests = 3; -} - -// Describes a group of SessionComputations with an "entry point" computation -// that may refer to the other non-entry (AKA embedded) computations. -// -// This message is used to serialize a computation that has been built via the -// XLA service API, along with its dependencies, for purposes such as -// analysis/replay/file-storage. -message SessionModule { - // The entry computation, which was requested for serialization. This may have - // referred to embedded computations, which are reflected below. - SessionComputation entry = 1; - - // Embedded computations that are transitively referred to by the entry - // computation. - repeated SessionComputation embedded_computations = 2; - - // The arguments passed to the computation. - repeated LiteralProto arguments = 3; - - // The result of the computation. - LiteralProto result = 4; - - // The name of the platform used to run the computation. - string execution_platform = 5; -} diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc index 48b2922e77b787..d624f548b1ba65 100644 --- a/tensorflow/compiler/xla/service/shape_inference.cc +++ b/tensorflow/compiler/xla/service/shape_inference.cc @@ -58,6 +58,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) { return UNOP_COS; case HloOpcode::kExp: return UNOP_EXP; + case HloOpcode::kExpm1: + return UNOP_EXPM1; case HloOpcode::kFloor: return UNOP_FLOOR; case HloOpcode::kImag: @@ -66,6 +68,8 @@ UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) { return UNOP_IS_FINITE; case HloOpcode::kLog: return UNOP_LOG; + case HloOpcode::kLog1p: + return UNOP_LOG1P; case HloOpcode::kNot: return UNOP_NOT; case HloOpcode::kNegate: @@ -168,24 +172,24 @@ bool AllUnique(tensorflow::gtl::ArraySlice slice) { return std::set(slice.begin(), slice.end()).size() == slice.size(); } -tensorflow::Status ExpectNotTupleOrOpaque(const Shape& shape, - tensorflow::StringPiece op_type) { +Status ExpectNotTupleOrOpaque(const Shape& shape, + tensorflow::StringPiece op_type) { if (ShapeUtil::IsTuple(shape)) { return InvalidArgument("Expected non-tuple argument for %s, but got %s.", - op_type.ToString().c_str(), + std::string(op_type).c_str(), ShapeUtil::HumanString(shape).c_str()); } else if (ShapeUtil::IsOpaque(shape)) { return InvalidArgument("Expected non-opaque argument for %s, but got %s.", - op_type.ToString().c_str(), + std::string(op_type).c_str(), ShapeUtil::HumanString(shape).c_str()); } else { - return tensorflow::Status::OK(); + return Status::OK(); } } -tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape, - const Shape& init_value_shape, - const PrimitiveType& input_element_type) { +Status VerifyReducerShape(const ProgramShape& reducer_shape, + const Shape& init_value_shape, + const PrimitiveType& input_element_type) { if (reducer_shape.parameters_size() != 2) { return InvalidArgument( "Reduction function must take 2 parameters, but " @@ -245,7 +249,7 @@ tensorflow::Status VerifyReducerShape(const ProgramShape& reducer_shape, ShapeUtil::HumanString(accumulator_shape).c_str()); } - return tensorflow::Status::OK(); + return Status::OK(); } StatusOr InferWindowOutputShape(const Shape& base_shape, @@ -312,7 +316,8 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, /* static */ StatusOr 
ShapeInference::InferUnaryOpShape( HloOpcode opcode, const Shape& shape) { // There is no copy operation at the proto level, so handle copy explicitly. - if (opcode == HloOpcode::kCopy) { + // A domain shape is the same as the input one. + if (opcode == HloOpcode::kCopy || opcode == HloOpcode::kDomain) { return shape; } @@ -337,7 +342,9 @@ StatusOr InferWindowOutputShape(const Shape& base_shape, case UNOP_COS: case UNOP_SIN: case UNOP_EXP: + case UNOP_EXPM1: case UNOP_LOG: + case UNOP_LOG1P: case UNOP_TANH: if (!ShapeUtil::ElementIsFloating(arg) && !ShapeUtil::ElementIsComplex(arg)) { @@ -1212,11 +1219,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( scale_shape, "scale input of batch norm training")); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape) == - tensorflow::Status::OK()); + Status::OK()); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(offset_shape) == - tensorflow::Status::OK()); + Status::OK()); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape) == - tensorflow::Status::OK()); + Status::OK()); if (feature_index >= ShapeUtil::Rank(operand_shape)) { return InvalidArgument( @@ -1318,15 +1325,15 @@ ShapeInference::InferDegenerateDimensionBroadcastShape( scale_shape, "scale input of batch norm inference")); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(operand_shape) == - tensorflow::Status::OK()); + Status::OK()); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(offset_shape) == - tensorflow::Status::OK()); + Status::OK()); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(scale_shape) == - tensorflow::Status::OK()); + Status::OK()); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(mean_shape) == - tensorflow::Status::OK()); + Status::OK()); TF_RET_CHECK(ShapeUtil::ValidateShapeWithOptionalLayout(variance_shape) == - tensorflow::Status::OK()); + Status::OK()); if (feature_index >= ShapeUtil::Rank(operand_shape)) { return InvalidArgument( diff --git a/tensorflow/compiler/xla/service/shaped_buffer.cc b/tensorflow/compiler/xla/service/shaped_buffer.cc index fb3b5f06dad67b..7d7dcac10b6593 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.cc +++ b/tensorflow/compiler/xla/service/shaped_buffer.cc @@ -15,7 +15,6 @@ limitations under the License. #include "tensorflow/compiler/xla/service/shaped_buffer.h" -#include #include #include @@ -25,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/gtl/flatset.h" #include "tensorflow/core/lib/strings/stringprintf.h" #include "tensorflow/core/platform/logging.h" @@ -123,6 +123,8 @@ ScopedShapedBuffer::ScopedShapedBuffer(ScopedShapedBuffer&& s) } ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) { + Deallocate(); + *static_cast(this) = std::move(static_cast(s)); allocator_ = s.allocator_; // Null out s.allocator_ so it doesn't try to free anything in its destructor. @@ -130,7 +132,15 @@ ScopedShapedBuffer& ScopedShapedBuffer::operator=(ScopedShapedBuffer&& s) { return *this; } -ScopedShapedBuffer::~ScopedShapedBuffer() { +ScopedShapedBuffer::~ScopedShapedBuffer() { Deallocate(); } + +ShapedBuffer ScopedShapedBuffer::release() { + ShapedBuffer shaped_buffer(static_cast(*this)); + buffers_ = ShapeTree(); + return shaped_buffer; +} + +void ScopedShapedBuffer::Deallocate() { // allocator_ will be null if we were moved-from. 
if (allocator_ == nullptr) { return; @@ -138,22 +148,14 @@ ScopedShapedBuffer::~ScopedShapedBuffer() { // Deallocate all non-null buffers. A buffer may appear in more than one spot // in the shape (eg, a tuple with a repeated element) so keep track of what // has been deallocated. - std::set deallocated_opaques; + tensorflow::gtl::FlatSet deallocated_ptrs; for (auto& pair : buffers_) { se::DeviceMemoryBase& memory_base = pair.second; if (!memory_base.is_null() && - deallocated_opaques.count(memory_base.opaque()) == 0) { - deallocated_opaques.insert(memory_base.opaque()); - TF_CHECK_OK( - this->allocator_->Deallocate(this->device_ordinal(), &memory_base)); + deallocated_ptrs.insert(memory_base.opaque()).second) { + TF_CHECK_OK(allocator_->Deallocate(device_ordinal(), memory_base)); } } } -ShapedBuffer ScopedShapedBuffer::release() { - ShapedBuffer shaped_buffer(static_cast(*this)); - buffers_ = ShapeTree(); - return shaped_buffer; -} - } // namespace xla diff --git a/tensorflow/compiler/xla/service/shaped_buffer.h b/tensorflow/compiler/xla/service/shaped_buffer.h index e10fca9e9466c0..905a7e82e621f2 100644 --- a/tensorflow/compiler/xla/service/shaped_buffer.h +++ b/tensorflow/compiler/xla/service/shaped_buffer.h @@ -148,13 +148,29 @@ class ScopedShapedBuffer : public ShapedBuffer { // ScopedShapedBuffer. DeviceMemoryAllocator* memory_allocator() const { return allocator_; } - // Releases all device memory owned by this ScopedShapedBuffer and returns the - // device memory pointers in the form of a ShapedBuffer. The returned - // ShapedBuffer takes over the memory from the ScopedShapedBuffer. The - // resulting ScopedShapedBuffer can only be destroyed. - ShapedBuffer release(); + // Sets the device memory buffer at the given index. + // + // If the given buffer's device memory is non-null, its device_ordinal and + // allocator must match those in `this`. + void set_buffer(OwningDeviceMemory buffer, const ShapeIndex& index) { + if (!buffer.is_null()) { + CHECK_EQ(buffer.device_ordinal(), device_ordinal()); + CHECK_EQ(buffer.allocator(), allocator_); + *buffers_.mutable_element(index) = buffer.Forget(); + } else { + *buffers_.mutable_element(index) = se::DeviceMemoryBase(); + } + } + + // Like unique_ptr::release(), creates and returns a regular ShapedBuffer from + // this ScopedShapedBuffer, without freeing any of the associated memory. + // + // It's the caller's job to ensure that the memory contained therein is freed. + TF_MUST_USE_RESULT ShapedBuffer release(); protected: + void Deallocate(); + DeviceMemoryAllocator* allocator_; }; diff --git a/tensorflow/compiler/xla/service/shaped_buffer_test.cc b/tensorflow/compiler/xla/service/shaped_buffer_test.cc new file mode 100644 index 00000000000000..0fc24366791165 --- /dev/null +++ b/tensorflow/compiler/xla/service/shaped_buffer_test.cc @@ -0,0 +1,110 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/xla/service/shaped_buffer.h" + +#include "tensorflow/compiler/xla/service/device_memory_allocator.h" +#include "tensorflow/compiler/xla/service/platform_util.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace xla { +namespace { + +TEST(ShapedBufferTest, ScopedShapeBufferAsShapedBufferB71629047) { + TF_ASSERT_OK_AND_ASSIGN(auto platforms, + xla::PlatformUtil::GetSupportedPlatforms()); + ASSERT_FALSE(platforms.empty()); + auto* platform = platforms[0]; + TF_ASSERT_OK_AND_ASSIGN(auto executors, + xla::PlatformUtil::GetStreamExecutors(platform)); + xla::StreamExecutorMemoryAllocator allocator(platform, executors); + const xla::Shape shape = xla::ShapeUtil::MakeShape(xla::F32, {}); + const int kDeviceOrdinal = 0; + auto scoped_buffer = tensorflow::MakeUnique( + shape, shape, &allocator, kDeviceOrdinal); + std::unique_ptr buffer = std::move(scoped_buffer); + buffer = nullptr; +} + +class TestAllocator : public DeviceMemoryAllocator { + public: + TestAllocator() + : DeviceMemoryAllocator(PlatformUtil::GetDefaultPlatform().ValueOrDie()) { + } + + ~TestAllocator() override { + if (!allocations_.empty()) { + ADD_FAILURE() << "Some allocations not freed!"; + } + } + + // Pull in two-arg overload of Allocate. + using DeviceMemoryAllocator::Allocate; + + StatusOr Allocate(int device_ordinal, uint64 size, + bool /*retry_on_failure*/) override { + // By contract, we must return null if size == 0. + if (size == 0) { + return OwningDeviceMemory(); + } + void* buf = malloc(size); + allocations_.insert({device_ordinal, buf}); + return OwningDeviceMemory(se::DeviceMemoryBase(buf, size), device_ordinal, + this); + } + + Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override { + if (mem.is_null()) { + return Status::OK(); + } + + auto it = allocations_.find({device_ordinal, mem.opaque()}); + if (it == allocations_.end()) { + ADD_FAILURE() << "Allocation not found (double free?)"; + } else { + free(mem.opaque()); + allocations_.erase(it); + } + return Status::OK(); + } + + bool AllowsAsynchronousDeallocation() const override { return false; } + + private: + std::set> allocations_; +}; + +TEST(ScopedShapedBufferTest, TestMoveAssignmentOperator) { + Shape s = ShapeUtil::MakeShape(F32, {1}); + TestAllocator allocator; + ScopedShapedBuffer sb1(s, s, &allocator, /*device_ordinal=*/0); + sb1.set_buffer( + allocator.Allocate(/*device_ordinal=*/0, /*size=*/42).ValueOrDie(), + /*index=*/{}); + + ScopedShapedBuffer sb2(s, s, &allocator, /*device_ordinal=*/1); + sb2.set_buffer( + allocator.Allocate(/*device_ordinal=*/1, /*size=*/10).ValueOrDie(), + /*index=*/{}); + + sb1 = std::move(sb2); + + // TestAllocator's destructor checks that all memory was freed. +} + +} // anonymous namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/source_map_util.h b/tensorflow/compiler/xla/service/source_map_util.h index a776d745f4e56c..18e2651abb1600 100644 --- a/tensorflow/compiler/xla/service/source_map_util.h +++ b/tensorflow/compiler/xla/service/source_map_util.h @@ -23,7 +23,7 @@ limitations under the License. namespace xla { namespace source_map_util { -// Creates an INVALID_ARUGMENT status with the given format string. +// Creates an INVALID_ARGUMENT status with the given format string. 
// // Also, attempts to extract the OpMetadata for parameter_number on executable // and append it to the status message for source mapping to user code. diff --git a/tensorflow/compiler/xla/service/transfer_manager.cc b/tensorflow/compiler/xla/service/transfer_manager.cc index 8b71a415091f02..c4d01562c4e322 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.cc +++ b/tensorflow/compiler/xla/service/transfer_manager.cc @@ -37,7 +37,7 @@ TransferManager::GetPlatformTransferManagers() { } Status TransferManager::TransferArrayToDevice( - se::StreamExecutor* executor, const Literal& literal, + se::StreamExecutor* executor, const LiteralSlice& literal, const se::DeviceMemoryBase& dest) { const Shape on_device_shape = HostShapeToDeviceShape(literal.shape()); TF_RET_CHECK(ShapeUtil::IsArray(on_device_shape)) @@ -196,9 +196,11 @@ StatusOr TransferManager::AllocateScopedShapedBuffer( const ShapeIndex& index = pair.first; se::DeviceMemoryBase& memory_base = pair.second; const Shape& subshape = ShapeUtil::GetSubshape(on_device_shape, index); - TF_ASSIGN_OR_RETURN(memory_base, + TF_ASSIGN_OR_RETURN(auto memory, allocator->Allocate(shaped_buffer.device_ordinal(), GetByteSizeRequirement(subshape))); + // Move the allocated buffer into the ScopedShapedBuffer, which owns it. + memory_base = memory.Forget(); } return std::move(shaped_buffer); diff --git a/tensorflow/compiler/xla/service/transfer_manager.h b/tensorflow/compiler/xla/service/transfer_manager.h index d82b4f0f81b5da..43a8092b06fba0 100644 --- a/tensorflow/compiler/xla/service/transfer_manager.h +++ b/tensorflow/compiler/xla/service/transfer_manager.h @@ -65,14 +65,14 @@ class TransferManager { // of the ShapedBuffer and DeviceShape(literal.shape()) must be compatible, // but need not have the same layout virtual Status TransferLiteralToDevice(se::StreamExecutor* executor, - const Literal& literal, + const LiteralSlice& literal, const ShapedBuffer& device_buffer) = 0; // Convenience methods for transferring an array to or from the device at a // known address. This avoids having to construct a ShapedBuffer just to // transfer an array at a known address. Status TransferArrayToDevice(se::StreamExecutor* executor, - const Literal& literal, + const LiteralSlice& literal, const se::DeviceMemoryBase& dest); StatusOr> TransferArrayFromDevice( se::StreamExecutor* executor, const Shape& shape, @@ -81,7 +81,7 @@ class TransferManager { // Transfers the given literal into the Infeed interface of the device, // using the given executor. virtual Status TransferLiteralToInfeed(se::StreamExecutor* executor, - const Literal& literal) = 0; + const LiteralSlice& literal) = 0; // Transfers the given literal from the Outfeed interface of the device, // using the given executor. 
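The transfer_manager hunks above widen several transfer entry points from `const Literal&` to `const LiteralSlice&`, so a caller can hand over either an owning literal or a borrowed view of one without copying the payload. The following is a minimal stand-alone sketch of that pattern using toy stand-ins, not the real `xla::Literal` / `xla::LiteralSlice` classes:

```cpp
// Toy stand-ins, not the real xla::Literal / xla::LiteralSlice classes.
#include <cstddef>
#include <iostream>
#include <vector>

struct Literal {  // owning payload
  std::vector<float> data;
};

struct LiteralSlice {  // non-owning view over a Literal's payload
  const float* ptr = nullptr;
  std::size_t size = 0;
  LiteralSlice(const Literal& l) : ptr(l.data.data()), size(l.data.size()) {}
};

// A read-only "transfer" only needs a view, so it takes the slice type and
// accepts both owning literals and borrowed views at the call site.
void TransferToDevice(const LiteralSlice& literal) {
  for (std::size_t i = 0; i < literal.size; ++i) {
    std::cout << literal.ptr[i] << (i + 1 < literal.size ? ' ' : '\n');
  }
}

int main() {
  Literal owning{{1.0f, 2.0f, 3.0f}};
  TransferToDevice(owning);  // implicit Literal -> LiteralSlice, no deep copy
  return 0;
}
```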
diff --git a/tensorflow/compiler/xla/service/transpose_folding.cc b/tensorflow/compiler/xla/service/transpose_folding.cc index 3efd38ce0daa3e..ba16dc640e2d29 100644 --- a/tensorflow/compiler/xla/service/transpose_folding.cc +++ b/tensorflow/compiler/xla/service/transpose_folding.cc @@ -35,7 +35,8 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot( const HloInstruction& dot, const TransposeFolding::TransposableGemmOperandsFn& transposable_gemm_operands) { - if (HloOpcode::kDot != dot.opcode()) { + if (HloOpcode::kDot != dot.opcode() || + dot.dot_dimension_numbers().lhs_batch_dimensions_size() != 0) { return {}; } @@ -44,6 +45,8 @@ TransposeFolding::OperandIndices CanFoldOperandsIntoDot( auto& operand = *dot.operand(i); if (operand.IsRank2Transpose()) { operand_set.push_back(i); + } else if (ShapeUtil::Rank(operand.shape()) != 2) { + return {}; } } @@ -74,23 +77,39 @@ using InstructionOperandsPair = // Folds the operands of `dot` that are foldable transposes. `computation` is // the parent HLO computation of `dot`. -// -// Returns whether the module is changed. -bool FoldTransposeIntoDot(InstructionOperandsPair pair) { - auto* dot = pair.first; - std::vector instructions_to_fuse(1, dot); - for (const int64 operand_index : pair.second) { - instructions_to_fuse.push_back(dot->mutable_operand(operand_index)); - } - - // Early-exit if no operands are foldable. - if (instructions_to_fuse.size() == 1) { - return false; +Status FoldTransposeIntoDot(InstructionOperandsPair pair) { + HloInstruction* dot = pair.first; + + DotDimensionNumbers new_dim_numbers = dot->dot_dimension_numbers(); + HloInstruction* new_lhs = dot->mutable_operand(0); + HloInstruction* new_rhs = dot->mutable_operand(1); + + CHECK_EQ(new_dim_numbers.lhs_batch_dimensions_size(), 0); + CHECK_EQ(new_dim_numbers.rhs_batch_dimensions_size(), 0); + CHECK_EQ(new_dim_numbers.lhs_contracting_dimensions_size(), 1); + CHECK_EQ(new_dim_numbers.rhs_contracting_dimensions_size(), 1); + + for (int64 operand_index : pair.second) { + // We've checked that there aren't any batch dimensions and that the inputs + // are rank 2, and shape inference guarantees that there is exactly one + // contracting dimension. + if (operand_index == 0) { + CHECK_EQ(new_lhs->opcode(), HloOpcode::kTranspose); + new_dim_numbers.set_lhs_contracting_dimensions( + 0, 1 - new_dim_numbers.lhs_contracting_dimensions(0)); + new_lhs = new_lhs->mutable_operand(0); + } else { + CHECK_EQ(operand_index, 1); + CHECK_EQ(new_rhs->opcode(), HloOpcode::kTranspose); + new_dim_numbers.set_rhs_contracting_dimensions( + 0, 1 - new_dim_numbers.rhs_contracting_dimensions(0)); + new_rhs = new_rhs->mutable_operand(0); + } } - dot->parent()->CreateFusionInstruction( - instructions_to_fuse, HloInstruction::FusionKind::kTransposeDot); - return true; + std::unique_ptr new_dot = HloInstruction::CreateDot( + dot->shape(), new_lhs, new_rhs, new_dim_numbers); + return dot->parent()->ReplaceWithNewInstruction(dot, std::move(new_dot)); } // Folds the operands of `convolution` that are foldable transposes. 
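The rewritten `FoldTransposeIntoDot` above no longer builds a `kTransposeDot` fusion; for rank-2, non-batched dots it flips the folded operand's contracting dimension (`1 - contracting_dim`) and wires the dot directly to the transpose's input. A small stand-alone check, in plain C++ rather than XLA, that contracting `y` along its other dimension gives the same result as dotting with an explicitly materialized `transpose(y)`:

```cpp
// Demonstrates the equivalence behind FoldTransposeIntoDot for rank-2 dots:
// dot(x, transpose(y)) with rhs contracting dim 0 equals dot(x, y) with
// rhs contracting dim 1 - 0 = 1.
#include <cassert>

int main() {
  const int M = 2, K = 3, N = 2;
  float x[M][K] = {{1, 2, 3}, {4, 5, 6}};    // lhs, contracting dim 1
  float y[N][K] = {{7, 8, 9}, {10, 11, 12}};  // rhs before the transpose

  // Form 1: materialize y^T (shape [K][N]) and contract its dimension 0.
  float yt[K][N];
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n) yt[k][n] = y[n][k];

  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float via_transpose = 0, folded = 0;
      for (int k = 0; k < K; ++k) {
        via_transpose += x[m][k] * yt[k][n];  // dot(x, transpose(y))
        folded += x[m][k] * y[n][k];          // dot(x, y), rhs contracting dim 1
      }
      assert(via_transpose == folded);
    }
  }
  return 0;
}
```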
@@ -196,7 +215,7 @@ StatusOr TransposeFolding::Run(HloModule* module) { std::make_pair(instruction, operand_indices)); } } - return tensorflow::Status::OK(); + return Status::OK(); }; for (auto* comp : module->MakeNonfusionComputations()) { @@ -205,7 +224,8 @@ StatusOr TransposeFolding::Run(HloModule* module) { bool changed = false; for (InstructionOperandsPair& pair : foldable_dots) { - changed |= FoldTransposeIntoDot(pair); + TF_RETURN_IF_ERROR(FoldTransposeIntoDot(pair)); + changed = true; } for (InstructionOperandsPair& pair : foldable_convolutions) { changed |= FoldTransposeIntoConvolution(pair); diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc index caa1a111ad880b..3139801ea31303 100644 --- a/tensorflow/compiler/xla/service/transpose_folding_test.cc +++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc @@ -19,13 +19,15 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" +#include "tensorflow/compiler/xla/service/hlo_matchers.h" #include "tensorflow/compiler/xla/service/hlo_module.h" #include "tensorflow/compiler/xla/service/hlo_opcode.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/shape_inference.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" @@ -34,6 +36,8 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" +namespace op = xla::testing::opcode_matchers; + namespace xla { namespace { @@ -54,83 +58,102 @@ class TransposeFoldingTest : public HloTestBase { }; TEST_F(TransposeFoldingTest, FoldDotTranspose) { - auto builder = HloComputation::Builder("entry_computation"); - HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}), - /*name=*/"x")); - HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3}), - /*name=*/"y")); - HloInstruction* transpose_y = - builder.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0})); - DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(1); - dot_dnums.add_rhs_contracting_dimensions(0); - HloInstruction* dot = builder.AddInstruction( - HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x, - /*rhs=*/transpose_y, dot_dnums)); - - HloModule module("test_module"); - HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(dot)); - FoldTranspose(&module); + string hlo_string = R"( +HloModule FoldDotTranspose + +ENTRY entry_computation { + x = f32[2,3]{1,0} parameter(0) + y = f32[2,3]{1,0} parameter(1) + transpose = f32[3,2]{1,0} transpose(y), dimensions={1,0} + ROOT dot = f32[2,2]{1,0} dot(x, transpose), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); - // Instructions after folding: x, y, and the fusion. 
- std::unordered_set instruction_set( - entry_computation->instructions().begin(), - entry_computation->instructions().end()); - CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation."; - CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation."; - CHECK_EQ(1, instruction_set.size()) - << "entry_computation should contain exactly 3 instructions."; - HloInstruction* fusion = *instruction_set.begin(); - EXPECT_EQ(HloOpcode::kFusion, fusion->opcode()); + FoldTranspose(module.get()); + + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Dot(op::Parameter(0), op::Parameter(1), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/1)); +} - // The fusion instruction should contain two parameters, one transpose and - // one dot. - EXPECT_EQ(4, fusion->fused_instruction_count()); +TEST_F(TransposeFoldingTest, DontFoldTransposeOfBatchDim) { + string hlo_string = R"( +HloModule FoldDotTranspose + +ENTRY entry_computation { + x = f32[2,3] parameter(0) + y = f32[3,2] parameter(1) + transpose = f32[2,3] transpose(y), dimensions={1,0} + ROOT dot = f32[2] dot(x, transpose), lhs_batch_dims={0}, rhs_batch_dims={0}, lhs_contracting_dims={1}, rhs_contracting_dims={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TransposeFolding transpose_folding( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }, + [](const HloInstruction& convolution, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }); + TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get())); + EXPECT_FALSE(changed); +} + +TEST_F(TransposeFoldingTest, DontFoldTransposeOfRank1Dot) { + string hlo_string = R"( +HloModule FoldDotTranspose + +ENTRY entry_computation { + x = f32[3] parameter(0) + y = f32[3,2] parameter(1) + transpose = f32[2,3] transpose(y), dimensions={1,0} + ROOT dot = f32[2] dot(x, transpose), lhs_batch_dims={}, rhs_batch_dims={0}, lhs_contracting_dims={0}, rhs_contracting_dims={1} +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TransposeFolding transpose_folding( + [](const HloInstruction& dot, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }, + [](const HloInstruction& convolution, + const TransposeFolding::OperandIndices& candidate_operands) { + return candidate_operands; + }); + TF_ASSERT_OK_AND_ASSIGN(bool changed, transpose_folding.Run(module.get())); + EXPECT_FALSE(changed); } TEST_F(TransposeFoldingTest, FoldDotTransposeConstant) { - auto builder = HloComputation::Builder("entry_computation"); - // 2x1 - HloInstruction* const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR2({{1}, {2}}))); - // 3x2 - HloInstruction* const1 = - builder.AddInstruction(HloInstruction::CreateConstant( - Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}}))); - HloInstruction* transpose0 = - builder.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::MakeShape(F32, {1, 2}), const0, {1, 0})); - HloInstruction* transpose1 = - builder.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::MakeShape(F32, {2, 3}), const1, {1, 0})); - DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(1); - dot_dnums.add_rhs_contracting_dimensions(0); - HloInstruction* dot = builder.AddInstruction(HloInstruction::CreateDot( - ShapeUtil::MakeShape(F32, {1, 3}), - /*lhs=*/transpose0, 
/*rhs=*/transpose1, dot_dnums)); - - HloModule module("test_module"); - HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(dot)); - FoldTranspose(&module); - - for (auto* instruction : entry_computation->instructions()) { - if (instruction->opcode() == HloOpcode::kFusion) { - CHECK_EQ(2, instruction->operand_count()); - EXPECT_EQ(const0, instruction->operand(0)); - EXPECT_EQ(const1, instruction->operand(1)); - } - } + string hlo_string = R"( +HloModule FoldDotTransposeConstant + +ENTRY entry_computation { + constant = f32[2,1]{1,0} constant(f32[2,1] { { 1 }, { 2 } }) + transpose = f32[1,2]{1,0} transpose(constant), dimensions={1,0} + constant.1 = f32[3,2]{1,0} constant(f32[3,2] { { 1, 2 }, { 3, 4 }, { 5, 6 } }) + transpose.1 = f32[2,3]{1,0} transpose(constant.1), dimensions={1,0} + ROOT dot = f32[1,3]{1,0} dot(transpose, transpose.1), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + FoldTranspose(module.get()); - // The created fusion instruction should contain two parameters, two - // transposes (one for each parameter) and one dot. - EXPECT_EQ(5, - entry_computation->root_instruction()->fused_instruction_count()); + EXPECT_THAT(module->entry_computation()->root_instruction(), + op::Dot(op::Constant(), op::Constant(), + /*lhs_contracting_dim=*/0, /*rhs_contracting_dim=*/1)); } TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { @@ -149,10 +172,10 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary( add->shape(), HloOpcode::kMultiply, add, sub)); - HloModule module("fuse_with_constant_operands"); + auto module = CreateNewModule("fuse_with_constant_operands"); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(mul)); - HloInstruction* call = module.OutlineExpressionFromComputation( + module->AddEntryComputation(builder.Build(mul)); + HloInstruction* call = module->OutlineExpressionFromComputation( {add, sub, mul}, "", entry_computation); EXPECT_EQ(call, entry_computation->root_instruction()); HloComputation* callee_computation = call->to_apply(); @@ -164,50 +187,32 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) { EXPECT_EQ(6, callee_computation->instruction_count()); } -TEST_F(TransposeFoldingTest, FoldDotTransposeInWhile) { - auto builder = HloComputation::Builder("entry_computation"); - HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {2, 3}), - /*name=*/"x")); - HloInstruction* y = builder.AddInstruction(HloInstruction::CreateParameter( - /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {2, 3}), - /*name=*/"y")); - HloInstruction* transpose_y = - builder.AddInstruction(HloInstruction::CreateTranspose( - ShapeUtil::MakeShape(F32, {3, 2}), y, {1, 0})); - DotDimensionNumbers dot_dnums; - dot_dnums.add_lhs_contracting_dimensions(1); - dot_dnums.add_rhs_contracting_dimensions(0); - HloInstruction* dot = builder.AddInstruction( - HloInstruction::CreateDot(ShapeUtil::MakeShape(F32, {2, 2}), /*lhs=*/x, - /*rhs=*/transpose_y, dot_dnums)); - - HloModule module("test_module"); - HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(dot)); - - HloInstruction* call = module.OutlineExpressionFromComputation( - {transpose_y, dot}, "outlined", entry_computation); +TEST_F(TransposeFoldingTest, FoldDotTransposeInCall) { + string hlo_string = R"( 
+HloModule FoldDotTransposeInCall - FoldTranspose(&module); - - // Instructions after folding: x, y, and the fusion. - std::unordered_set instruction_set( - entry_computation->instructions().begin(), - entry_computation->instructions().end()); - CHECK_EQ(1, instruction_set.erase(x)) << "x is not in entry_computation."; - CHECK_EQ(1, instruction_set.erase(y)) << "y is not in entry_computation."; - CHECK_EQ(1, instruction_set.erase(call)) - << "call is not in entry_computation."; - CHECK(instruction_set.empty()) - << "entry_computation should contain exactly 3 instructions."; - HloInstruction* fusion = - call->called_computations().front()->root_instruction(); - EXPECT_EQ(HloOpcode::kFusion, fusion->opcode()); +callee { + name.0 = f32[2,3]{1,0} parameter(0) + name.1 = f32[2,3]{1,0} parameter(1) + transpose.clone = f32[3,2]{1,0} transpose(name.0), dimensions={1,0} + ROOT dot.clone = f32[2,2]{1,0} dot(name.1, transpose.clone), lhs_contracting_dims={1}, rhs_contracting_dims={0} +} - // The fusion instruction should contain two parameters, one transpose and - // one dot. - EXPECT_EQ(4, fusion->fused_instruction_count()); +ENTRY entry_computation { + y = f32[2,3]{1,0} parameter(1) + x = f32[2,3]{1,0} parameter(0) + ROOT call = f32[2,2]{1,0} call(y, x), to_apply=callee +} +)"; + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + FoldTranspose(module.get()); + + const HloComputation* callee = module->GetComputationWithName("callee"); + ASSERT_NE(callee, nullptr); + EXPECT_THAT(callee->root_instruction(), + op::Dot(op::Parameter(1), op::Parameter(0), + /*lhs_contracting_dim=*/1, /*rhs_contracting_dim=*/1)); } // Test that a two dimension swap of the kernel gets folded into convolution. @@ -222,7 +227,7 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) { HloInstruction* transpose_y = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), y, {1, 0, 2, 3})); - auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers(); + auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(); Window window; for (int i = 0; i < 2; ++i) { WindowDimension* dim = window.add_dimensions(); @@ -240,10 +245,10 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) { HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve( conv_shape.ValueOrDie(), x, transpose_y, window, dnums)); - HloModule module("test_module"); + auto module = CreateNewModule("test_module"); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(conv)); - FoldTranspose(&module); + module->AddEntryComputation(builder.Build(conv)); + FoldTranspose(module.get()); // Instructions after folding: x, y, and the convolution. 
std::unordered_set instruction_set( @@ -275,7 +280,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) { HloInstruction* transpose_y = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), y, {1, 3, 0, 2})); - auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers(); + auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(); Window window; for (int i = 0; i < 2; ++i) { WindowDimension* dim = window.add_dimensions(); @@ -293,10 +298,10 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) { HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve( conv_shape.ValueOrDie(), x, transpose_y, window, dnums)); - HloModule module("test_module"); + auto module = CreateNewModule("test_module"); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(conv)); - FoldTranspose(&module); + module->AddEntryComputation(builder.Build(conv)); + FoldTranspose(module.get()); // Instructions after folding: x, y, and the convolution. std::unordered_set instruction_set( @@ -334,7 +339,7 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) { HloInstruction* transpose_x = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), x, {1, 0, 2, 3})); - auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers(); + auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(); Window window; for (int i = 0; i < 2; ++i) { WindowDimension* dim = window.add_dimensions(); @@ -351,10 +356,10 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) { HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve( conv_shape.ValueOrDie(), transpose_x, y, window, dnums)); - HloModule module("test_module"); + auto module = CreateNewModule("test_module"); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(conv)); - FoldTranspose(&module); + module->AddEntryComputation(builder.Build(conv)); + FoldTranspose(module.get()); // Instructions after folding: x, y, and the convolution. std::unordered_set instruction_set( @@ -398,7 +403,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) { HloInstruction* transpose_x = builder.AddInstruction(HloInstruction::CreateTranspose( ShapeUtil::MakeShape(F32, {2, 3, 1, 1}), x, {1, 0, 3, 2})); - auto dnums = ComputationBuilder::CreateDefaultConvDimensionNumbers(); + auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(); Window window; for (int i = 0; i < 2; ++i) { WindowDimension* dim = window.add_dimensions(); @@ -415,10 +420,10 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) { HloInstruction* conv = builder.AddInstruction(HloInstruction::CreateConvolve( conv_shape.ValueOrDie(), transpose_x, y, window, dnums)); - HloModule module("test_module"); + auto module = CreateNewModule("test_module"); HloComputation* entry_computation = - module.AddEntryComputation(builder.Build(conv)); - FoldTranspose(&module); + module->AddEntryComputation(builder.Build(conv)); + FoldTranspose(module.get()); // Instructions after folding: x, y, and the convolution. 
std::unordered_set instruction_set( diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc index 657a8fe09ae9df..bb634e6573ffce 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc @@ -273,6 +273,14 @@ Status TuplePointsToAnalysis::HandleBitcast(HloInstruction* bitcast) { return Status::OK(); } +Status TuplePointsToAnalysis::HandleDomain(HloInstruction* domain) { + // A kDomain instruction aliases its operand. That is, the buffer of its + // result *is* the buffer of its operand, so just copy the operands points-to + // set. + CreateCopiedPointsToSet(domain, domain->operand(0)); + return Status::OK(); +} + Status TuplePointsToAnalysis::HandleSlice(HloInstruction* slice) { // A kSlice instruction aliases its operand if the backend lowers it to an // in-place implementation. @@ -588,4 +596,201 @@ void TuplePointsToAnalysis::InstructionToString( }); } +bool TuplePointsToAnalysis::DoesNotUseOperandBuffer( + const HloInstruction* operand, const ShapeIndex& index, + const HloInstruction* user) const { + CHECK(user->IsUserOf(operand)) + << "user: " << user->ToString() << " operand: " << operand->ToString(); + if (user->opcode() == HloOpcode::kGetTupleElement && !index.empty()) { + // GetTupleElement instructions only access the top-level buffer of their + // operand. + return true; + } else if (user->opcode() == HloOpcode::kFusion && + user->fusion_kind() == HloInstruction::FusionKind::kLoop) { + // Find fusion parameter associated with 'operand'. + auto it = std::find_if( + user->fused_parameters().begin(), user->fused_parameters().end(), + [=](HloInstruction* fused_param) { + return user->operand(fused_param->parameter_number()) == operand; + }); + CHECK(it != user->fused_parameters().end()); + // Iterate through all users of all buffer aliases of the buffer in the + // points-to set of fusion parameter at 'index'. + // Return false if any uses are detected at 'index', returns true otherwise. + const LogicalBuffer* buffer = GetBufferDefinedAt(*it, index).ValueOrDie(); + for (const BufferAlias& alias : GetBufferAliases(*buffer)) { + for (HloInstruction* alias_user : alias.instruction()->users()) { + if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(), + alias_user)) { + continue; + } + // Return false: use detected at 'buffer' -> 'alias' -> 'alias_user'. + return false; + } + } + // Return true: found no uses of 'operand' at 'index' in 'user'. + return true; + } + return false; +} + +// Returns all uses of all aliases of 'instruction' at 'index' in 'uses'. +// Each use in 'uses' is a pair (HloInstruction* user, int64 operand_index) +// where 'user' is a user of an alias of 'instruction' at 'index', and +// 'operand_index' is the operand index at which the alias appears in the +// operand list of 'user'. 
+std::vector<std::pair<HloInstruction*, int64>>
+TuplePointsToAnalysis::GetAllUsesOfInstructionAtIndex(
+    HloInstruction* instruction, const ShapeIndex& index) const {
+  std::vector<std::pair<HloInstruction*, int64>> uses;
+  const PointsToSet::BufferList& points_to =
+      GetPointsToSet(instruction).element(index);
+  for (const LogicalBuffer* buffer : points_to) {
+    for (const BufferAlias& alias : GetBufferAliases(*buffer)) {
+      for (HloInstruction* alias_user : alias.instruction()->users()) {
+        if (DoesNotUseOperandBuffer(alias.instruction(), alias.index(),
+                                    alias_user)) {
+          continue;
+        }
+        for (int64 op_idx : alias_user->OperandIndices(alias.instruction())) {
+          uses.emplace_back(alias_user, op_idx);
+        }
+      }
+    }
+  }
+  return uses;
+}
+
+// Returns true if there is exactly one use of 'operand' at 'operand_index'
+// in 'fusion.fused_instructions', where the singleton use is the fused
+// root at operand index 'use_operand_index'. Returns false otherwise.
+//
+// REQUIRES: 'fusion' opcode is a kFusion instruction.
+bool TuplePointsToAnalysis::HasUniqueFusedUseOfOperandAt(
+    HloInstruction* operand, const ShapeIndex& operand_index,
+    HloInstruction* fusion, const int64 use_operand_index) const {
+  CHECK_EQ(HloOpcode::kFusion, fusion->opcode());
+  // Check that 'operand' is unique in the operand list of 'fusion'.
+  if (fusion->OperandIndices(operand).size() > 1) {
+    return false;
+  }
+  // Find fusion parameter associated with 'operand'.
+  const auto& fused_params = fusion->fused_parameters();
+  auto fused_param_it = std::find_if(
+      fused_params.begin(), fused_params.end(),
+      [&](HloInstruction* fused_param) {
+        return fusion->operand(fused_param->parameter_number()) == operand;
+      });
+  if (fused_param_it == fused_params.end()) {
+    return false;
+  }
+  auto* fused_param = *fused_param_it;
+  // Get all uses of 'operand' at 'index' from 'fusion.fused_instructions'.
+  auto fused_param_uses =
+      GetAllUsesOfInstructionAtIndex(fused_param, operand_index);
+  // Return true iff there is exactly one use of 'operand' at 'index', and
+  // this singleton use is the fused root (at operand index 'use_operand_index').
+  return fused_param_uses.size() == 1 &&
+         fused_param_uses[0].first == fusion->fused_expression_root() &&
+         fused_param_uses[0].second == use_operand_index;
+}
+
+// User and operand can share buffers iff both instructions emit the same shape
+// and layout, and 'user' meets one of the following qualifications:
+//
+// (1) Is element-wise. Or...
+// (2) Is a loop fusion instruction where the only use of 'operand' at 'index'
+//     in the set 'user.fused_instructions' is a DynamicUpdateSlice fused root
+//     at operand 0. Or...
+// (3) Is a kDot -> kAdd output fusion instruction where the only use of
+//     'operand' at 'index' in the set 'user.fused_instructions' is a kAdd
+//     fused root at operand 0 or 1. Or...
+// (4) The 'user' of 'operand' is DynamicUpdateSlice or While at operand index
+//     0.
+//
+// (2) and (3) can only be determined if points-to analysis is available.
+bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
+    HloInstruction* operand, const ShapeIndex& operand_index,
+    HloInstruction* user, const ShapeIndex& user_index) const {
+  CHECK(user->IsUserOf(operand))
+      << "user: " << user->ToString() << " operand: " << operand->ToString();
+  const Shape& operand_subshape =
+      ShapeUtil::GetSubshape(operand->shape(), operand_index);
+  const Shape& user_subshape =
+      ShapeUtil::GetSubshape(user->shape(), user_index);
+  // Check that operand and user emit the same shape and layout.
+  if (!ShapeUtil::Equal(operand_subshape, user_subshape)) {
+    return false;
+  }
+  if (user->opcode() == HloOpcode::kFusion) {
+    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
+        user->fused_expression_root()->opcode() ==
+            HloOpcode::kDynamicUpdateSlice) {
+      // Loop fusion with kDynamicUpdateSlice fused root.
+      //
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root at operand
+      // index 0.
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0);
+    } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
+               user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
+      // Output fusion with kAdd fused root.
+
+      // Check if one operand of kAdd fused root is kDot or kConvolution.
+      auto* add = user->fused_expression_root();
+      auto add_operand_it =
+          std::find_if(add->operands().begin(), add->operands().end(),
+                       [&](HloInstruction* operand) {
+                         return operand->opcode() == HloOpcode::kConvolution ||
+                                operand->opcode() == HloOpcode::kDot;
+                       });
+      if (add_operand_it == add->operands().end()) {
+        return false;
+      }
+      auto* matched_add_operand = *add_operand_it;
+      // Calculate operand index of 'add' operand which was not matched above.
+      const int64 other_add_operand_index =
+          matched_add_operand == add->operand(0) ? 1 : 0;
+      // Returns true iff there is exactly one use of 'operand' at shape index
+      // 'operand_index', and this singleton use is the fused root (at operand
+      // index 'other_add_operand_index').
+      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user,
+                                          other_add_operand_index);
+    }
+  }
+  if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
+      user->opcode() == HloOpcode::kWhile) {
+    // We eliminated other users in BufferLiveness::live_range_strictly_before,
+    // so here we just need to check that the use is at operand index 0.
+    std::vector<int64> operand_indices = user->OperandIndices(operand);
+    return operand_indices.size() == 1 && operand_indices[0] == 0;
+  }
+  if (user->opcode() == HloOpcode::kCall) {
+    // TODO(b/62548313): Remove when buffer assignment is module scoped and
+    // does not assign buffers to calls.
+    // Find called computation parameter associated with 'operand'.
+    const std::vector<int64> operand_indices = user->OperandIndices(operand);
+    if (operand_indices.size() > 1) {
+      return false;
+    }
+    CHECK_EQ(1, operand_indices.size());
+    auto* param = user->to_apply()->parameter_instruction(operand_indices[0]);
+    // Get all uses of 'operand' at 'index' in called computation.
+    auto param_uses = GetAllUsesOfInstructionAtIndex(param, operand_index);
+
+    // Return true iff:
+    // *) There exists exactly one use of 'operand' in called computation.
+    // *) The unique use is by the root instruction of called computation.
+    //    (Note: we check the root of the called computation, because the
+    //    root result buffer is required to alias with the Call result buffer).
+    // *) The root instruction of the called computation is element-wise on
+    //    'operand'.
+    auto* callee_root = user->to_apply()->root_instruction();
+    return param_uses.size() == 1 && param_uses[0].first == callee_root &&
+           callee_root->IsElementwiseOnOperand(param_uses[0].second);
+  }
+  // Check if 'user' is element-wise.
+  return user->IsElementwise();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index c3743b150168eb..c0d82414806d9a 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -248,6 +248,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleTuple(HloInstruction* tuple) override;
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
+  Status HandleDomain(HloInstruction* domain) override;
   Status HandleSlice(HloInstruction* slice) override;
   Status HandleCopy(HloInstruction* copy) override;
   Status HandleRecvDone(HloInstruction* recv_done) override;
@@ -256,6 +257,23 @@

   string ToString() const;

+  // Returns true if 'user' cannot possibly use the buffer at 'index' in
+  // 'operand'. Returns false otherwise.
+  //
+  // REQUIRES: 'operand' is an operand of 'user'.
+  bool DoesNotUseOperandBuffer(const HloInstruction* operand,
+                               const ShapeIndex& index,
+                               const HloInstruction* user) const;
+
+  // Returns true if 'user' (at 'user_index') can share a buffer with its
+  // operand 'operand' (at 'operand_index'). Returns false otherwise.
+  //
+  // REQUIRES: 'operand' is an operand of 'user'.
+  bool CanShareOperandBufferWithUser(HloInstruction* operand,
+                                     const ShapeIndex& operand_index,
+                                     HloInstruction* user,
+                                     const ShapeIndex& user_index) const;
+
  private:
   explicit TuplePointsToAnalysis(
       const HloModule* module,
@@ -310,6 +328,13 @@
     return &per_instruction_[id];
   }

+  std::vector<std::pair<HloInstruction*, int64>> GetAllUsesOfInstructionAtIndex(
+      HloInstruction* instruction, const ShapeIndex& index) const;
+  bool HasUniqueFusedUseOfOperandAt(HloInstruction* operand,
+                                    const ShapeIndex& operand_index,
+                                    HloInstruction* fusion,
+                                    const int64 use_operand_index) const;
+
   // The module this analysis is performed on.
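(Editorial aside, not part of the patch: the two queries declared above are easiest to read alongside a usage sketch. The helper below, `OperandIsSafeToReuse`, is a hypothetical name; the sketch only assumes what this change itself shows, namely that `TuplePointsToAnalysis::Run` builds the analysis for a module and that `user` must already be a user of `operand`.)

```c++
// Minimal sketch (not part of this change) of how a client pass might consult
// the new buffer queries. 'OperandIsSafeToReuse' is a hypothetical helper.
#include <memory>

#include "tensorflow/compiler/xla/service/tuple_points_to_analysis.h"

namespace xla {

bool OperandIsSafeToReuse(const HloModule* module, HloInstruction* operand,
                          HloInstruction* user) {
  // Build the points-to analysis for the whole module, as the tests added in
  // this change do.
  std::unique_ptr<TuplePointsToAnalysis> analysis =
      TuplePointsToAnalysis::Run(module).ConsumeValueOrDie();

  // The empty ShapeIndex {} names the top-level buffer; tuple elements are
  // addressed as {0}, {1}, and so on.
  if (analysis->DoesNotUseOperandBuffer(operand, /*index=*/{}, user)) {
    return true;  // 'user' never touches the top-level buffer of 'operand'.
  }
  // Otherwise reuse is only safe if 'user' may share the buffer in place.
  return analysis->CanShareOperandBufferWithUser(operand, /*operand_index=*/{},
                                                 user, /*user_index=*/{});
}

}  // namespace xla
```

Whether a caller falls back from the first query to the second is a policy choice of the calling pass; the tests added below exercise each predicate separately.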
const HloModule* module_; diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc index dec446d4dac650..f558316b05b168 100644 --- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc +++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc @@ -805,5 +805,348 @@ TEST_F(FusionPointsToAnalysisTest, FusionParam0TwoUsers) { Run(/*add_additional_gte0_user=*/true); } +class PointsToAnalysisTestBase : public HloTestBase { + protected: + void BuildModule(std::unique_ptr computation) { + module_ = CreateNewModule(); + computation_ = module_->AddEntryComputation(std::move(computation)); + } + + void RunAnalysis() { + CHECK_NOTNULL(module_.get()); + points_to_analysis_ = + TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie(); + } + + void BuildModuleAndRunAnalysis(std::unique_ptr computation) { + BuildModule(std::move(computation)); + RunAnalysis(); + } + + std::unique_ptr module_; + HloComputation* computation_ = nullptr; + std::unique_ptr points_to_analysis_; +}; + +class DoesNotUseOperandBufferTest : public PointsToAnalysisTestBase {}; + +TEST_F(DoesNotUseOperandBufferTest, GetTupleElement) { + auto builder = HloComputation::Builder(TestName()); + + Shape elem_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({elem_shape, elem_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(elem_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(elem_shape, tuple, 1)); + builder.AddInstruction( + HloInstruction::CreateBinary(elem_shape, HloOpcode::kAdd, gte0, gte1)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // GetTupleElement instructions only access the top-level buffer of their + // operand. + EXPECT_TRUE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {0}, gte0)); + EXPECT_TRUE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {1}, gte1)); + EXPECT_FALSE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte0)); + EXPECT_FALSE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {}, gte1)); +} + +TEST_F(DoesNotUseOperandBufferTest, FusedDynamicUpdateSlice) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. + auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR1({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, gte1, update, starts)); + builder.AddInstruction( + HloInstruction::CreateTuple({gte0, dynamic_update_slice})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dynamic_update_slice, starts, update, gte1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + // The fusion instruction never uses tuple element 0, but does use element 1. 
+ EXPECT_TRUE(points_to_analysis_->DoesNotUseOperandBuffer(tuple, {0}, fusion)); + EXPECT_FALSE( + points_to_analysis_->DoesNotUseOperandBuffer(tuple, {1}, fusion)); +} + +class CanShareOperandBufferWithUserTest : public PointsToAnalysisTestBase {}; + +TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) { + auto builder = HloComputation::Builder(TestName()); + + Shape shape = ShapeUtil::MakeShape(F32, {8}); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); + auto log = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kLog, exp)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {})); + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(exp, {}, log, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) { + auto builder = HloComputation::Builder(TestName()); + + Shape in_shape = ShapeUtil::MakeShape(F32, {8}); + Shape out_shape = ShapeUtil::MakeShape(PRED, {8}); + auto param0 = builder.AddInstruction( + HloInstruction::CreateParameter(0, in_shape, "param0")); + auto param1 = builder.AddInstruction( + HloInstruction::CreateParameter(1, in_shape, "param1")); + auto result = builder.AddInstruction( + HloInstruction::CreateBinary(out_shape, HloOpcode::kEq, param0, param1)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(param0, {}, + result, {})); + EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(param1, {}, + result, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, CopyShares) { + auto builder = HloComputation::Builder(TestName()); + + Shape shape = ShapeUtil::MakeShape(F32, {8}); + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto exp = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kExp, param)); + auto copy = builder.AddInstruction( + HloInstruction::CreateUnary(shape, HloOpcode::kCopy, exp)); + + BuildModuleAndRunAnalysis(builder.Build()); + + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(param, {}, exp, {})); + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(exp, {}, copy, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + auto tuple = builder.AddInstruction(HloInstruction::CreateParameter( + 0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple")); + auto gte0 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 0)); + auto gte1 = builder.AddInstruction( + HloInstruction::CreateGetTupleElement(data_shape, tuple, 1)); + + // Create a DynamicUpdateSlice instruction of tuple element 1. 
+ auto starts = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR1({2}))); + auto update = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR1({2.f, 2.f, 2.f}))); + auto dynamic_update_slice = + builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, gte1, update, starts)); + builder.AddInstruction( + HloInstruction::CreateTuple({gte0, dynamic_update_slice})); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {dynamic_update_slice, starts, update, gte1}, + HloInstruction::FusionKind::kLoop); + RunAnalysis(); + + // The fusion instruction can share with tuple element 1. + EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(tuple, {0}, + fusion, {})); + EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(tuple, {1}, + fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) { + auto builder = HloComputation::Builder(TestName()); + + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + Shape update_shape = ShapeUtil::MakeShape(F32, {4}); + Shape starts_shape = ShapeUtil::MakeShape(S32, {1}); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + auto update = builder.AddInstruction( + HloInstruction::CreateParameter(1, update_shape, "update")); + auto starts = builder.AddInstruction( + HloInstruction::CreateParameter(2, starts_shape, "starts")); + auto dus = builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice( + data_shape, data, update, starts)); + + BuildModuleAndRunAnalysis(builder.Build()); + + // The DynamicUpdateSlice instruction can share with the data operand, but not + // with update or starts. + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(data, {}, dus, {})); + EXPECT_FALSE( + points_to_analysis_->CanShareOperandBufferWithUser(update, {}, dus, {})); + EXPECT_FALSE( + points_to_analysis_->CanShareOperandBufferWithUser(starts, {}, dus, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, FusedDotAdd) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto a = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR2({{1.0, 0.0}, {0.0, 1.0}}))); + auto b = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto dot = builder.AddInstruction( + HloInstruction::CreateDot(data_shape, a, b, dot_dnums)); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto add_operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto add = builder.AddInstruction(HloInstruction::CreateBinary( + data_shape, HloOpcode::kAdd, dot, add_operand)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, dot}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused dot add should be able to share buffer with 'add_operand'. 
+ EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser( + add_operand, {}, fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, OutputFusionCantAliasOperandBuffer) { + auto builder = HloComputation::Builder(TestName()); + Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2}); + + auto one = builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto operand = builder.AddInstruction( + HloInstruction::CreateBroadcast(data_shape, one, {1})); + + auto reverse = builder.AddInstruction( + HloInstruction::CreateReverse(data_shape, operand, {0, 1})); + + auto two = builder.AddInstruction(HloInstruction::CreateConstant( + Literal::CreateR2({{2.0, 2.0}, {2.0, 2.0}}))); + + auto add = builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, reverse, two)); + + BuildModule(builder.Build()); + auto fusion = computation_->CreateFusionInstruction( + {add, two, reverse}, HloInstruction::FusionKind::kOutput); + RunAnalysis(); + + // Output fused operand->reverse->add cannot alias operand buffer 'operand'. + EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(operand, {}, + fusion, {})); +} + +TEST_F(CanShareOperandBufferWithUserTest, WhileCanShare) { + Shape data_shape = ShapeUtil::MakeShape(F32, {8}); + + auto make_cond = [this, &data_shape]() { + auto builder = HloComputation::Builder(TestName() + ".Cond"); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + builder.AddInstruction(HloInstruction::CreateBinary( + ShapeUtil::MakeShape(PRED, {}), HloOpcode::kEq, data, data)); + return builder.Build(); + }; + + auto make_body = [this, &data_shape]() { + auto builder = HloComputation::Builder(TestName() + ".Body"); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + builder.AddInstruction( + HloInstruction::CreateBinary(data_shape, HloOpcode::kAdd, data, data)); + return builder.Build(); + }; + + module_ = CreateNewModule(); + HloComputation* cond_computation = + module_->AddEmbeddedComputation(make_cond()); + HloComputation* body_computation = + module_->AddEmbeddedComputation(make_body()); + + auto builder = HloComputation::Builder(TestName()); + auto data = builder.AddInstruction( + HloInstruction::CreateParameter(0, data_shape, "data")); + auto whil = builder.AddInstruction(HloInstruction::CreateWhile( + data_shape, cond_computation, body_computation, data)); + computation_ = module_->AddEntryComputation(builder.Build()); + + RunAnalysis(); + + // The While instruction can share with the data operand. + EXPECT_TRUE( + points_to_analysis_->CanShareOperandBufferWithUser(data, {}, whil, {})); +} + +// Tests that Call can alias operand buffer if the only use of the operand +// in the called computation is an elementwise instruction. +TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) { + Shape shape = ShapeUtil::MakeShape(F32, {8}); + // Build sub-computation with fusion root. 
+ auto sub_builder = HloComputation::Builder(TestName() + "_sub"); + auto sub_param = sub_builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "sub_param")); + auto one = sub_builder.AddInstruction( + HloInstruction::CreateConstant(Literal::CreateR0(1.0))); + auto ones = sub_builder.AddInstruction( + HloInstruction::CreateBroadcast(shape, one, {1})); + auto add = sub_builder.AddInstruction( + HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sub_param, ones)); + + module_ = CreateNewModule(); + auto sub_computation = module_->AddEmbeddedComputation(sub_builder.Build()); + sub_computation->CreateFusionInstruction({add, ones}, + HloInstruction::FusionKind::kLoop); + + // Build entry-computation with kCall which calls 'sub_computation'. + auto builder = HloComputation::Builder(TestName()); + + auto param = builder.AddInstruction( + HloInstruction::CreateParameter(0, shape, "param")); + auto reverse = + builder.AddInstruction(HloInstruction::CreateReverse(shape, param, {0})); + auto call = builder.AddInstruction( + HloInstruction::CreateCall(shape, {reverse}, sub_computation)); + computation_ = module_->AddEntryComputation(builder.Build()); + + RunAnalysis(); + + EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(reverse, {}, + call, {})); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc index 113c2e2bd9f73a..d668855084a884 100644 --- a/tensorflow/compiler/xla/service/tuple_simplifier.cc +++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc @@ -69,6 +69,7 @@ StatusOr TupleSimplifier::Run(HloModule* module) { // Tuple // HloInstruction* top_tuple = nullptr; + HloInstruction* first_gte = nullptr; bool can_simplify = true; for (int64 operand_number = 0; operand_number < instruction->operand_count(); ++operand_number) { @@ -78,11 +79,17 @@ StatusOr TupleSimplifier::Run(HloModule* module) { can_simplify = false; break; } - + if (first_gte == nullptr) { + first_gte = operand; + } else if (!first_gte->has_compatible_sharding(operand)) { + can_simplify = false; + break; + } if (top_tuple == nullptr) { top_tuple = operand->mutable_operand(0); if (!ShapeUtil::Compatible(top_tuple->shape(), - instruction->shape())) { + instruction->shape()) || + !instruction->has_compatible_sharding(top_tuple)) { can_simplify = false; break; } @@ -108,15 +115,17 @@ StatusOr TupleSimplifier::Run(HloModule* module) { // | // GTE if (instruction->operand(0)->opcode() == HloOpcode::kTuple) { - changed = true; HloInstruction* element_source = instruction->mutable_operand(0)->mutable_operand( instruction->tuple_index()); - TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source)); - for (HloInstruction* user : element_source->users()) { - if (user->opcode() == HloOpcode::kTuple || - user->opcode() == HloOpcode::kGetTupleElement) { - worklist.push(user); + if (instruction->has_compatible_sharding(element_source)) { + changed = true; + TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source)); + for (HloInstruction* user : element_source->users()) { + if (user->opcode() == HloOpcode::kTuple || + user->opcode() == HloOpcode::kGetTupleElement) { + worklist.push(user); + } } } } diff --git a/tensorflow/compiler/xla/service/tuple_util_test.cc b/tensorflow/compiler/xla/service/tuple_util_test.cc index 754fd8ef169231..d33d5bb8f30c85 100644 --- a/tensorflow/compiler/xla/service/tuple_util_test.cc +++ b/tensorflow/compiler/xla/service/tuple_util_test.cc @@ -16,8 +16,8 @@ 
limitations under the License. #include "tensorflow/compiler/xla/service/tuple_util.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" namespace xla { namespace { @@ -37,7 +37,7 @@ ENTRY entry { )"; TF_ASSIGN_OR_RETURN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); *entry_computation = module->entry_computation(); *param0 = (*entry_computation)->parameter_instruction(0); diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc deleted file mode 100644 index 0f16a592b68e20..00000000000000 --- a/tensorflow/compiler/xla/service/user_computation.cc +++ /dev/null @@ -1,3553 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/user_computation.h" - -#include -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_instruction.h" -#include "tensorflow/compiler/xla/service/hlo_opcode.h" -#include "tensorflow/compiler/xla/service/shape_inference.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" -#include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/protobuf.h" - -namespace xla { -namespace { - -HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) { - switch (unop) { - case UNOP_ABS: - return HloOpcode::kAbs; - case UNOP_CEIL: - return HloOpcode::kCeil; - case UNOP_CLZ: - return HloOpcode::kClz; - case UNOP_COS: - return HloOpcode::kCos; - case UNOP_EXP: - return HloOpcode::kExp; - case UNOP_FLOOR: - return HloOpcode::kFloor; - case UNOP_IMAG: - return HloOpcode::kImag; - case UNOP_IS_FINITE: - return HloOpcode::kIsFinite; - case UNOP_LOG: - return HloOpcode::kLog; - case UNOP_NOT: - return HloOpcode::kNot; - case UNOP_NEGATE: - return HloOpcode::kNegate; - case UNOP_REAL: - return HloOpcode::kReal; - case UNOP_ROUND_NEAREST_AFZ: - return HloOpcode::kRoundNearestAfz; - case UNOP_SIGN: - return HloOpcode::kSign; - case UNOP_SIN: - return HloOpcode::kSin; - case UNOP_SORT: - return HloOpcode::kSort; - case UNOP_TANH: - return HloOpcode::kTanh; - default: - LOG(FATAL) << "unhandled operation " << unop; - } -} - -HloOpcode 
BinaryOperationToHloOpcode(BinaryOperation binop) { - switch (binop) { - case BINOP_ATAN2: - return HloOpcode::kAtan2; - case BINOP_COMPLEX: - return HloOpcode::kComplex; - case BINOP_MUL: - return HloOpcode::kMultiply; - case BINOP_ADD: - return HloOpcode::kAdd; - case BINOP_SUB: - return HloOpcode::kSubtract; - case BINOP_DIV: - return HloOpcode::kDivide; - case BINOP_EQ: - return HloOpcode::kEq; - case BINOP_GE: - return HloOpcode::kGe; - case BINOP_GT: - return HloOpcode::kGt; - case BINOP_LE: - return HloOpcode::kLe; - case BINOP_LT: - return HloOpcode::kLt; - case BINOP_NE: - return HloOpcode::kNe; - case BINOP_MAX: - return HloOpcode::kMaximum; - case BINOP_MIN: - return HloOpcode::kMinimum; - case BINOP_POW: - return HloOpcode::kPower; - case BINOP_REM: - return HloOpcode::kRemainder; - case BINOP_OR: - return HloOpcode::kOr; - case BINOP_AND: - return HloOpcode::kAnd; - case BINOP_SHIFT_LEFT: - return HloOpcode::kShiftLeft; - case BINOP_SHIFT_RIGHT_ARITHMETIC: - return HloOpcode::kShiftRightArithmetic; - case BINOP_SHIFT_RIGHT_LOGICAL: - return HloOpcode::kShiftRightLogical; - default: - LOG(FATAL) << "unhandled operation " << binop; - } -} - -HloOpcode TernaryOperationToHloOpcode(TernaryOperation triop) { - switch (triop) { - case TRIOP_CLAMP: - return HloOpcode::kClamp; - case TRIOP_SELECT: - return HloOpcode::kSelect; - default: - LOG(FATAL) << "unhandled operation " << triop; - } -} - -HloOpcode VariadicOperationToHloOpcode(VariadicOperation varop) { - switch (varop) { - case VAROP_TUPLE: - return HloOpcode::kTuple; - default: - LOG(FATAL) << "unhandled operation " << varop; - } -} - -} // namespace - -/* static */ StatusOr> -UserComputation::MakeWithRemapping( - const SessionComputation& session_computation, - const ComputationHandle& handle, - const std::map& old_to_new) { - auto user_computation = - MakeUnique(session_computation.name(), handle); - { - tensorflow::mutex_lock lock(user_computation->mutex_); - user_computation->session_computation_ = session_computation; - user_computation->next_handle_value_ = - std::max_element(session_computation.requests().begin(), - session_computation.requests().end(), - [](const std::pair& lhs, - const std::pair& rhs) { - return lhs.first < rhs.first; - }) - ->first + - 1; - TF_RETURN_IF_ERROR(user_computation->RemapEmbeddedComputations(old_to_new)); - } - - return std::move(user_computation); -} - -UserComputation::UserComputation(const string& name, - const ComputationHandle& handle) - : name_(name), next_handle_value_(1) { - *session_computation_.mutable_computation_handle() = handle; - session_computation_.set_name(name); - - VLOG(1) << "New UserComputation \"" << name - << "\", handle: " << handle.handle(); -} - -ComputationDataHandle UserComputation::CreateComputationDataHandle() { - ComputationDataHandle handle; - handle.set_handle(next_handle_value_); - // Handles are used as Version values and *must* be assigned consecutively for - // computation versioning to work. 
- next_handle_value_++; - return handle; -} - -StatusOr UserComputation::AddParameterInstruction( - const ParameterRequest& parameter_request) { - tensorflow::mutex_lock lock(mutex_); - - int64 parameter_number = parameter_request.parameter(); - if (parameters_.count(parameter_number) != 0) { - return InvalidArgument("parameter %lld already registered", - parameter_number); - } - ComputationDataHandle handle = CreateComputationDataHandle(); - - const Shape& validated_shape = parameter_request.shape(); - TF_RETURN_IF_ERROR( - ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape)); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = validated_shape; - *request.mutable_request()->mutable_parameter_request() = parameter_request; - - parameters_[parameter_number] = &request; - - VLOG(1) << "AddParameterInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << parameter_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddSendInstruction( - const SendRequest& send_request) { - tensorflow::mutex_lock lock(mutex_); - - // Check if the operand of the instruction is valid. - TF_RETURN_IF_ERROR(LookUpRequest(send_request.operand()).status()); - - // No handle is returned, but a handle must be assigned to this instruction - // for computation versioning. - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = ShapeUtil::MakeNil(); - *request.mutable_request()->mutable_send_request() = send_request; - - VLOG(1) << "AddSendInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << send_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddRecvInstruction( - const RecvRequest& recv_request) { - tensorflow::mutex_lock lock(mutex_); - - const Shape& shape = recv_request.shape(); - TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_recv_request() = recv_request; - - VLOG(1) << "AddRecvInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << recv_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddPadInstruction( - const PadRequest& pad_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(pad_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* padding_value, - LookUpRequest(pad_request.padding_value())); - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, ShapeInference::InferPadShape( - operand->output_shape(), - padding_value->output_shape(), - pad_request.padding_config())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_pad_request() = pad_request; - - VLOG(1) << 
"AddPadInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << pad_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConstantInstruction( - const ConstantRequest& constant_request) { - const Shape& validated_shape = constant_request.literal().shape(); - TF_RETURN_IF_ERROR( - ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape)); - - tensorflow::mutex_lock lock(mutex_); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = validated_shape; - *request.mutable_request()->mutable_constant_request() = constant_request; - - VLOG(1) << "AddConstantInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle(); - return handle; -} - -StatusOr UserComputation::AddGatherInstruction( - const GatherRequest& gather_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* input_request, - LookUpRequest(gather_request.input())); - TF_ASSIGN_OR_RETURN(const OperationRequest* gather_indices_request, - LookUpRequest(gather_request.gather_indices())); - - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferGatherShape( - input_request->output_shape(), gather_indices_request->output_shape(), - gather_request.dimension_numbers(), - AsInt64Slice(gather_request.window_bounds()))); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_gather_request() = gather_request; - - VLOG(1) << "AddGatherInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << gather_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddGetTupleElementInstruction( - const GetTupleElementRequest& get_tuple_element_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(get_tuple_element_request.operand())); - if (!ShapeUtil::IsTuple(operand->output_shape())) { - return InvalidArgument( - "Operand to GetTupleElement() is not a tuple; got %s", - ShapeUtil::HumanString(operand->output_shape()).c_str()); - } - Shape element_shape = ShapeUtil::GetTupleElementShape( - operand->output_shape(), get_tuple_element_request.index()); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = element_shape; - *request.mutable_request()->mutable_get_tuple_element_request() = - get_tuple_element_request; - - VLOG(1) << "AddGetTupleElementInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << get_tuple_element_request.ShortDebugString(); - return handle; -} - -Status UserComputation::AddTraceInstruction(const TraceRequest& trace_request) { - tensorflow::mutex_lock lock(mutex_); - - // Verify that the operand index is valid. 
- TF_RETURN_IF_ERROR(LookUpRequest(trace_request.operand()).status()); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = ShapeUtil::MakeNil(); - *request.mutable_request()->mutable_trace_request() = trace_request; - - VLOG(1) << "AddTraceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << trace_request.ShortDebugString(); - return Status::OK(); -} - -StatusOr UserComputation::AddRngInstruction( - const RngRequest& rng_request) { - tensorflow::mutex_lock lock(mutex_); - - // Check the number of parameters per RNG distribution. - switch (rng_request.distribution()) { - case RandomDistribution::RNG_NORMAL: - case RandomDistribution::RNG_UNIFORM: - if (rng_request.parameter_size() != 2) { - return InvalidArgument( - "RNG distribution (%s) expects 2 parameters, but got %d", - RandomDistribution_Name(rng_request.distribution()).c_str(), - rng_request.parameter_size()); - } - break; - default: - LOG(FATAL) << "unhandled distribution " << rng_request.distribution(); - } - - // Verify that the parameter indices are valid; - for (const ComputationDataHandle& param : rng_request.parameter()) { - TF_RETURN_IF_ERROR(LookUpRequest(param).status()); - } - const Shape& validated_shape = rng_request.shape(); - TF_RETURN_IF_ERROR( - ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = validated_shape; - *request.mutable_request()->mutable_rng_request() = rng_request; - - VLOG(1) << "AddRngInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << rng_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddMapInstruction( - const MapRequest& map_request, - const UserComputation& to_apply_computation) { - tensorflow::mutex_lock lock(mutex_); - - std::vector operand_shapes; - for (const ComputationDataHandle& handle : map_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - operand_shapes.push_back(&operand->output_shape()); - } - - VersionedComputationHandle::Version to_apply_version = - to_apply_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr to_apply_program_shape, - to_apply_computation.ComputeProgramShape(to_apply_version)); - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferMapShape(operand_shapes, *to_apply_program_shape, - AsInt64Slice(map_request.dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(to_apply_version); - *request.mutable_request()->mutable_map_request() = map_request; - - VLOG(1) << "AddMapInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << map_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReduceInstruction( - const ReduceRequest& reduce_request, - const UserComputation& to_apply_computation) { - 
tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reduce_request.operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookUpRequest(reduce_request.init_value())); - - VersionedComputationHandle::Version to_apply_version = - to_apply_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr to_apply_program_shape, - to_apply_computation.ComputeProgramShape(to_apply_version)); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferReduceShape( - operand->output_shape(), init_value->output_shape(), - AsInt64Slice(reduce_request.dimensions()), *to_apply_program_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(to_apply_version); - *request.mutable_request()->mutable_reduce_request() = reduce_request; - - VLOG(1) << "AddReduceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reduce_request.ShortDebugString(); - return handle; -} - -StatusOr -UserComputation::AddBatchNormTrainingInstruction( - const BatchNormTrainingRequest& batch_norm_training_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(batch_norm_training_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* scale, - LookUpRequest(batch_norm_training_request.scale())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* offset, - LookUpRequest(batch_norm_training_request.offset())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferBatchNormTrainingShape( - operand->output_shape(), scale->output_shape(), - offset->output_shape(), batch_norm_training_request.feature_index())); - - *request.mutable_output_shape() = inferred_shape; - - *request.mutable_output_handle() = handle; - - *request.mutable_request()->mutable_batch_norm_training_request() = - batch_norm_training_request; - - VLOG(1) << "AddBatchNormTrainingInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << batch_norm_training_request.ShortDebugString(); - - return handle; -} - -StatusOr -UserComputation::AddBatchNormInferenceInstruction( - const BatchNormInferenceRequest& batch_norm_inference_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(batch_norm_inference_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* scale, - LookUpRequest(batch_norm_inference_request.scale())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* offset, - LookUpRequest(batch_norm_inference_request.offset())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* mean, - LookUpRequest(batch_norm_inference_request.mean())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* variance, - LookUpRequest(batch_norm_inference_request.variance())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferBatchNormInferenceShape( - 
operand->output_shape(), scale->output_shape(), - offset->output_shape(), mean->output_shape(), - variance->output_shape(), - batch_norm_inference_request.feature_index())); - - *request.mutable_output_shape() = inferred_shape; - - *request.mutable_output_handle() = handle; - - *request.mutable_request()->mutable_batch_norm_inference_request() = - batch_norm_inference_request; - - VLOG(1) << "AddBatchNormInferenceInstruction (" - << GetVersionedHandleInternal() << "), data handle " - << handle.handle() << ": " - << batch_norm_inference_request.ShortDebugString(); - - return handle; -} - -StatusOr UserComputation::AddBatchNormGradInstruction( - const BatchNormGradRequest& batch_norm_grad_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(batch_norm_grad_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* scale, - LookUpRequest(batch_norm_grad_request.scale())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* mean, - LookUpRequest(batch_norm_grad_request.mean())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* variance, - LookUpRequest(batch_norm_grad_request.variance())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* grad_output, - LookUpRequest(batch_norm_grad_request.grad_output())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferBatchNormGradShape( - operand->output_shape(), scale->output_shape(), mean->output_shape(), - variance->output_shape(), grad_output->output_shape(), - batch_norm_grad_request.feature_index())); - - *request.mutable_output_shape() = inferred_shape; - - *request.mutable_output_handle() = handle; - - *request.mutable_request()->mutable_batch_norm_grad_request() = - batch_norm_grad_request; - - VLOG(1) << "AddBatchNormGradInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << batch_norm_grad_request.ShortDebugString(); - - return handle; -} - -StatusOr UserComputation::AddReduceWindowInstruction( - const ReduceWindowRequest& reduce_window_request, - const UserComputation& to_apply_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reduce_window_request.operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookUpRequest(reduce_window_request.init_value())); - - VersionedComputationHandle::Version to_apply_version = - to_apply_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr to_apply_program_shape, - to_apply_computation.ComputeProgramShape(to_apply_version)); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferReduceWindowShape( - operand->output_shape(), init_value->output_shape(), - reduce_window_request.window(), *to_apply_program_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(to_apply_version); - *request.mutable_request()->mutable_reduce_window_request() = - reduce_window_request; - - VLOG(1) << "AddReduceWindowInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reduce_window_request.ShortDebugString(); - 
return handle; -} - -StatusOr UserComputation::AddSelectAndScatterInstruction( - const SelectAndScatterRequest& select_and_scatter_request, - const UserComputation& select_computation, - const UserComputation& scatter_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(select_and_scatter_request.operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* source, - LookUpRequest(select_and_scatter_request.source())); - TF_ASSIGN_OR_RETURN(const OperationRequest* init_value, - LookUpRequest(select_and_scatter_request.init_value())); - - VersionedComputationHandle::Version select_version = - select_computation.version(); - TF_ASSIGN_OR_RETURN(std::shared_ptr select_program_shape, - select_computation.ComputeProgramShape(select_version)); - VersionedComputationHandle::Version scatter_version = - scatter_computation.version(); - TF_ASSIGN_OR_RETURN(std::shared_ptr scatter_program_shape, - scatter_computation.ComputeProgramShape(scatter_version)); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferSelectAndScatterShape( - operand->output_shape(), *select_program_shape, - select_and_scatter_request.window(), source->output_shape(), - init_value->output_shape(), *scatter_program_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(select_version); - request.add_embedded_computation_versions(scatter_version); - *request.mutable_request()->mutable_select_and_scatter_request() = - select_and_scatter_request; - - VLOG(1) << "AddSelectAndScatterInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << select_and_scatter_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReverseInstruction( - const ReverseRequest& reverse_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reverse_request.operand())); - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferReverseShape( - operand->output_shape(), AsInt64Slice(reverse_request.dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_reverse_request() = reverse_request; - VLOG(1) << "AddReverseInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reverse_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddWhileInstruction( - const WhileRequest& while_request, - const UserComputation& condition_computation, - const UserComputation& body_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* init, - LookUpRequest(while_request.init())); - - VersionedComputationHandle::Version condition_version = - condition_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr condition_program_shape, - condition_computation.ComputeProgramShape(condition_version)); - - VersionedComputationHandle::Version body_version = body_computation.version(); - TF_ASSIGN_OR_RETURN(std::shared_ptr 
body_program_shape, - body_computation.ComputeProgramShape(body_version)); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferWhileShape( - *condition_program_shape, *body_program_shape, init->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(condition_version); - request.add_embedded_computation_versions(body_version); - *request.mutable_request()->mutable_while_request() = while_request; - - VLOG(1) << "AddWhileInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << while_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConditionalInstruction( - const ConditionalRequest& conditional_request, - const UserComputation& true_computation, - const UserComputation& false_computation) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* pred, - LookUpRequest(conditional_request.predicate())); - TF_ASSIGN_OR_RETURN(const OperationRequest* true_operand, - LookUpRequest(conditional_request.true_operand())); - TF_ASSIGN_OR_RETURN(const OperationRequest* false_operand, - LookUpRequest(conditional_request.false_operand())); - - VersionedComputationHandle::Version true_computation_version = - true_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr true_computation_shape, - true_computation.ComputeProgramShape(true_computation_version)); - - VersionedComputationHandle::Version false_computation_version = - false_computation.version(); - TF_ASSIGN_OR_RETURN( - std::shared_ptr false_computation_shape, - false_computation.ComputeProgramShape(false_computation_version)); - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferConditionalShape( - pred->output_shape(), true_operand->output_shape(), - false_operand->output_shape(), - *true_computation_shape, *false_computation_shape)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - request.add_embedded_computation_versions(true_computation_version); - request.add_embedded_computation_versions(false_computation_version); - *request.mutable_request()->mutable_conditional_request() = - conditional_request; - - VLOG(1) << "AddConditionalInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << conditional_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddBroadcastInstruction( - const BroadcastRequest& broadcast_request) { - tensorflow::mutex_lock lock(mutex_); - - // Fetches and validates the operand. 
- TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(broadcast_request.operand())); - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferBroadcastShape( - operand->output_shape(), - AsInt64Slice(broadcast_request.broadcast_sizes()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_broadcast_request() = broadcast_request; - - VLOG(1) << "AddBroadcastInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << broadcast_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReshapeInstruction( - const ReshapeRequest& reshape_request) { - tensorflow::mutex_lock lock(mutex_); - - // Fetches and validates the operand. - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reshape_request.operand())); - - TF_ASSIGN_OR_RETURN( - Shape inferred_shape, - ShapeInference::InferReshapeShape( - operand->output_shape(), AsInt64Slice(reshape_request.dimensions()), - AsInt64Slice(reshape_request.new_sizes()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_reshape_request() = reshape_request; - - VLOG(1) << "AddReshapeInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reshape_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddTransposeInstruction( - const TransposeRequest& transpose_request) { - tensorflow::mutex_lock lock(mutex_); - - // Fetches and validates the operand. 
- TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(transpose_request.operand())); - - TF_ASSIGN_OR_RETURN(Shape inferred_shape, - ShapeInference::InferTransposeShape( - operand->output_shape(), - AsInt64Slice(transpose_request.dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = inferred_shape; - *request.mutable_request()->mutable_transpose_request() = transpose_request; - - VLOG(1) << "AddTransposeInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << transpose_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddSliceInstruction( - const SliceRequest& slice_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(slice_request.operand())); - - TF_ASSIGN_OR_RETURN( - Shape new_shape, - ShapeInference::InferSliceShape( - operand->output_shape(), AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()), - AsInt64Slice(slice_request.strides()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_slice_request() = slice_request; - - VLOG(1) << "AddSliceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << slice_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddDynamicSliceInstruction( - const DynamicSliceRequest& dynamic_slice_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(dynamic_slice_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* start_indices, - LookUpRequest(dynamic_slice_request.start_indices())); - - TF_ASSIGN_OR_RETURN( - Shape new_shape, - ShapeInference::InferDynamicSliceShape( - operand->output_shape(), start_indices->output_shape(), - AsInt64Slice(dynamic_slice_request.slice_sizes()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_dynamic_slice_request() = - dynamic_slice_request; - - VLOG(1) << "AddDynamicSliceInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << dynamic_slice_request.ShortDebugString(); - return handle; -} - -StatusOr -UserComputation::AddDynamicUpdateSliceInstruction( - const DynamicUpdateSliceRequest& dynamic_update_slice_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(dynamic_update_slice_request.operand())); - - TF_ASSIGN_OR_RETURN(const OperationRequest* update, - LookUpRequest(dynamic_update_slice_request.update())); - - TF_ASSIGN_OR_RETURN( - const OperationRequest* start_indices, - LookUpRequest(dynamic_update_slice_request.start_indices())); - - TF_ASSIGN_OR_RETURN(Shape new_shape, - ShapeInference::InferDynamicUpdateSliceShape( - operand->output_shape(), update->output_shape(), - 
start_indices->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_dynamic_update_slice_request() = - dynamic_update_slice_request; - - VLOG(1) << "AddDynamicUpdateSliceInstruction (" - << GetVersionedHandleInternal() << "), data handle " - << handle.handle() << ": " - << dynamic_update_slice_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConcatenateInstruction( - const ConcatenateRequest& concatenate_request) { - tensorflow::mutex_lock lock(mutex_); - - std::vector operand_shapes; - for (const ComputationDataHandle& handle : concatenate_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - operand_shapes.push_back(&operand->output_shape()); - } - - TF_ASSIGN_OR_RETURN(Shape new_shape, - ShapeInference::InferConcatOpShape( - operand_shapes, concatenate_request.dimension())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_concatenate_request() = - concatenate_request; - - VLOG(1) << "AddConcatenateInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << concatenate_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConvertInstruction( - const ConvertRequest& convert_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(convert_request.operand())); - - TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape( - operand->output_shape(), - convert_request.new_element_type())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_convert_request() = convert_request; - - VLOG(1) << "AddConvertInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << convert_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddBitcastConvertInstruction( - const ConvertRequest& convert_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(convert_request.operand())); - - TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape( - operand->output_shape(), - convert_request.new_element_type())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_bitcast_convert_request() = - convert_request; - - VLOG(1) << "AddBitcastConvertInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << convert_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddReducePrecisionInstruction( - const ReducePrecisionRequest& 
reduce_precision_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(reduce_precision_request.operand())); - - TF_ASSIGN_OR_RETURN( - Shape new_shape, - ShapeInference::InferReducePrecisionShape( - operand->output_shape(), reduce_precision_request.exponent_bits(), - reduce_precision_request.mantissa_bits())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = new_shape; - *request.mutable_request()->mutable_reduce_precision_request() = - reduce_precision_request; - - VLOG(1) << "AddReducePrecisionInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << reduce_precision_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddConvolveInstruction( - const ConvolveRequest& convolve_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(convolve_request.lhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(convolve_request.rhs())); - TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvolveShape( - lhs->output_shape(), rhs->output_shape(), - convolve_request.window(), - convolve_request.dimension_numbers())); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_convolve_request() = convolve_request; - - VLOG(1) << "AddConvolveInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << convolve_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddFftInstruction( - const FftRequest& fft_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(fft_request.operand())); - TF_ASSIGN_OR_RETURN(Shape shape, - ShapeInference::InferFftShape( - operand->output_shape(), fft_request.fft_type(), - AsInt64Slice(fft_request.fft_length()))); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_fft_request() = fft_request; - - VLOG(1) << "AddFftInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << fft_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddCrossReplicaSumInstruction( - const CrossReplicaSumRequest& cross_replica_sum_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(cross_replica_sum_request.operand())); - TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape( - {&operand->output_shape()})); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_cross_replica_sum_request() = - 
      cross_replica_sum_request;
-
-  VLOG(1) << "AddCrossReplicaSumInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << cross_replica_sum_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddInfeedInstruction(
-    const InfeedRequest& infeed_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  const Shape& shape = infeed_request.shape();
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("Given shape to Infeed must have a layout");
-  }
-
-  const ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_infeed_request() = infeed_request;
-
-  VLOG(1) << "AddInfeedInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << infeed_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddOutfeedInstruction(
-    const OutfeedRequest& outfeed_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  const Shape& shape = outfeed_request.shape();
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("Given shape to Outfeed must have a layout");
-  }
-
-  // Verify that operand is valid.
-  TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status());
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_outfeed_request() = outfeed_request;
-
-  VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << outfeed_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddCallInstruction(
-    const CallRequest& call_request,
-    const UserComputation& to_apply_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  std::vector<const Shape*> operand_shapes;
-  for (const ComputationDataHandle& handle : call_request.operands()) {
-    TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
-    operand_shapes.push_back(&operand->output_shape());
-  }
-
-  VersionedComputationHandle::Version to_apply_version =
-      to_apply_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> to_apply_program_shape,
-      to_apply_computation.ComputeProgramShape(to_apply_version));
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferCallShape(operand_shapes, *to_apply_program_shape));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(to_apply_version);
-  *request.mutable_request()->mutable_call_request() = call_request;
-
-  VLOG(1) << "AddCallInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << call_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
-    const CustomCallRequest& custom_call_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  for (const ComputationDataHandle& handle : custom_call_request.operands()) {
-    TF_RETURN_IF_ERROR(LookUpRequest(handle).status());
-  }
- - if (tensorflow::str_util::StartsWith(custom_call_request.call_target_name(), - "$")) { - return InvalidArgument( - "Invalid custom_call_target \"%s\": Call targets that start with '$' " - "are reserved for internal use.", - custom_call_request.call_target_name().c_str()); - } - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = custom_call_request.shape(); - *request.mutable_request()->mutable_custom_call_request() = - custom_call_request; - - VLOG(1) << "AddCustomCallInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << custom_call_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddHostComputeInstruction( - const HostComputeRequest& host_compute_request) { - tensorflow::mutex_lock lock(mutex_); - - for (const ComputationDataHandle& handle : host_compute_request.operands()) { - TF_RETURN_IF_ERROR(LookUpRequest(handle).status()); - } - - ComputationDataHandle handle = CreateComputationDataHandle(); - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = host_compute_request.shape(); - *request.mutable_request()->mutable_host_compute_request() = - host_compute_request; - - VLOG(1) << "AddHostComputeInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << host_compute_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddDotInstruction( - const DotRequest& dot_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(dot_request.lhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(dot_request.rhs())); - - TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape( - lhs->output_shape(), rhs->output_shape(), - dot_request.dimension_numbers())); - - const ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_dot_request() = dot_request; - - VLOG(1) << "AddDotInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << dot_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddUnaryInstruction( - const UnaryOpRequest& unary_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, - LookUpRequest(unary_request.operand())); - TF_ASSIGN_OR_RETURN( - Shape shape, ShapeInference::InferUnaryOpShape(unary_request.unop(), - operand->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_unary_op_request() = unary_request; - - VLOG(1) << "AddUnaryInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << unary_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddBinaryInstruction( - const BinaryOpRequest& 
binary_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(binary_request.lhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(binary_request.rhs())); - TF_ASSIGN_OR_RETURN( - Shape shape, - ShapeInference::InferBinaryOpShape( - binary_request.binop(), lhs->output_shape(), rhs->output_shape(), - AsInt64Slice(binary_request.broadcast_dimensions()))); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_binary_op_request() = binary_request; - - VLOG(1) << "AddBinaryInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << binary_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddTernaryInstruction( - const TernaryOpRequest& ternary_request) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* lhs, - LookUpRequest(ternary_request.lhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* rhs, - LookUpRequest(ternary_request.rhs())); - TF_ASSIGN_OR_RETURN(const OperationRequest* ehs, - LookUpRequest(ternary_request.ehs())); - TF_ASSIGN_OR_RETURN(Shape shape, - ShapeInference::InferTernaryOpShape( - ternary_request.triop(), lhs->output_shape(), - rhs->output_shape(), ehs->output_shape())); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_ternary_op_request() = ternary_request; - - VLOG(1) << "AddTernaryInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << ternary_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::AddVariadicInstruction( - const VariadicOpRequest& variadic_request) { - tensorflow::mutex_lock lock(mutex_); - - std::vector operand_shapes; - for (const ComputationDataHandle& handle : variadic_request.operands()) { - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - operand_shapes.push_back(&operand->output_shape()); - } - - TF_ASSIGN_OR_RETURN(Shape shape, - ShapeInference::InferVariadicOpShape( - variadic_request.varop(), operand_shapes)); - - ComputationDataHandle handle = CreateComputationDataHandle(); - - OperationRequest& request = - (*session_computation_.mutable_requests())[handle.handle()]; - *request.mutable_output_handle() = handle; - *request.mutable_output_shape() = shape; - *request.mutable_request()->mutable_variadic_op_request() = variadic_request; - - VLOG(1) << "AddVariadicInstruction (" << GetVersionedHandleInternal() - << "), data handle " << handle.handle() << ": " - << variadic_request.ShortDebugString(); - return handle; -} - -StatusOr UserComputation::GetShape(const ComputationDataHandle& handle) { - tensorflow::mutex_lock lock(mutex_); - - TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle)); - return operand->output_shape(); -} - -Status UserComputation::SetOpMetadata(const ComputationDataHandle& handle, - const OpMetadata& metadata) { - tensorflow::mutex_lock lock(mutex_); - - int64 handle_value = handle.handle(); - if (session_computation_.requests().count(handle_value) == 0) { 
- return InvalidArgument("Invalid handle in SetOpMetadata (%lld)", - handle_value); - } - *session_computation_.mutable_requests() - ->at(handle_value) - .mutable_request() - ->mutable_metadata() = metadata; - return Status::OK(); -} - -Status UserComputation::SetOpSharding(const ComputationDataHandle& handle, - const OpSharding& sharding) { - tensorflow::mutex_lock lock(mutex_); - - int64 handle_value = handle.handle(); - if (session_computation_.requests().count(handle_value) == 0) { - return InvalidArgument("Invalid handle in SetOpSharding (%lld)", - handle_value); - } - *session_computation_.mutable_requests() - ->at(handle_value) - .mutable_request() - ->mutable_sharding() = sharding; - return Status::OK(); -} - -Status UserComputation::SetReturnValue(const ComputationDataHandle& handle) { - tensorflow::mutex_lock lock(mutex_); - - if (!(handle.handle() > 0 && handle.handle() < next_handle_value_)) { - return InvalidArgument("Invalid handle in SetReturnValue"); - } - - handle_to_return_ = handle; - - VLOG(1) << "SetReturnValue of computation \"" << name() << "\" fixed to " - << GetVersionedHandleInternal(); - - return Status::OK(); -} - -VersionedComputationHandle UserComputation::GetVersionedHandle() const { - tensorflow::mutex_lock lock(mutex_); - return GetVersionedHandleInternal(); -} - -VersionedComputationHandle UserComputation::GetVersionedHandleInternal() const { - VersionedComputationHandle versioned_handle; - versioned_handle.handle = session_computation_.computation_handle(); - - if (handle_to_return_.handle() > 0) { - // A specific handle has been requested for the result of the computation. - versioned_handle.version = handle_to_return_.handle(); - } else { - // A version value is simply the most recently assigned - // ComputationDataHandle value, ie the handle value of the root of the - // computation. - versioned_handle.version = next_handle_value_ - 1; - } - - return versioned_handle; -} - -VersionedComputationHandle UserComputation::GetVersionedHandleAtOperation( - const ComputationDataHandle& operation) const { - tensorflow::mutex_lock lock(mutex_); - - // The version at which an operation was added is simply the handle value of - // the ComputationDataHandle. - VersionedComputationHandle versioned_handle; - versioned_handle.handle = session_computation_.computation_handle(); - versioned_handle.version = operation.handle(); - return versioned_handle; -} - -VersionedComputationHandle::Version UserComputation::version() const { - return GetVersionedHandle().version; -} - -namespace { - -// Returns true if the operation type corresponding to the given opcase can be -// the root of the computation. -bool CanBeRoot(const OpRequest::OpCase& op_case) { - switch (op_case) { - case OpRequest::kTraceRequest: - case OpRequest::kSendRequest: - case OpRequest::kOutfeedRequest: - return false; - default: - return true; - } -} - -// Returns a pointer to the operation with the given data handle value in the -// given SessionComputation. -StatusOr LookUpRequest( - int64 handle_value, const SessionComputation& session_computation) { - if (session_computation.requests().count(handle_value) == 0) { - return InvalidArgument("no ComputationDataHandle value %lld", handle_value); - } - return &session_computation.requests().at(handle_value); -} - -// Returns the OperationRequest corresponding to the root (result) of the -// session computation. 
-StatusOr<const OperationRequest*> GetRoot(
-    VersionedComputationHandle::Version version,
-    const SessionComputation& session_computation) {
-  TF_RET_CHECK(version > 0);
-  // Not all instructions can be roots. Walk backwards from the operation
-  // indicated by this version until a valid root is found.
-  const OperationRequest* root_request = nullptr;
-  while (version > 0) {
-    TF_ASSIGN_OR_RETURN(root_request,
-                        LookUpRequest(version, session_computation));
-    if (CanBeRoot(root_request->request().op_case())) {
-      break;
-    }
-    version--;
-  }
-  if (version == 0) {
-    return InternalError("Computation contains no root operation");
-  }
-  return root_request;
-}
-
-}  // namespace
-
-StatusOr<std::shared_ptr<const ProgramShape>>
-UserComputation::ComputeProgramShape(
-    VersionedComputationHandle::Version version) const {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_RET_CHECK(version > 0 && version < next_handle_value_);
-
-  if (program_shape_ == nullptr || program_shape_version_ != version) {
-    // ProgramShape has not been computed yet, or is for a different
-    // version. Compute it now.
-    TF_RETURN_IF_ERROR(CheckParametersAreContiguous(version));
-
-    auto program_shape = MakeUnique<ProgramShape>();
-    for (int64 request_num = 1; request_num <= version; ++request_num) {
-      const OperationRequest& request =
-          session_computation_.requests().at(request_num);
-      if (request.request().op_case() == OpRequest::kParameterRequest) {
-        const ParameterRequest& parameter_request =
-            request.request().parameter_request();
-        int64 param_no = parameter_request.parameter();
-        // Parameters may be out of order so expand ProgramShape parameters
-        // until it is at least large enough to hold the current parameter
-        // number.
-        while (program_shape->parameters_size() <= param_no) {
-          program_shape->add_parameters();
-          program_shape->add_parameter_names();
-        }
-        *program_shape->mutable_parameters(param_no) = request.output_shape();
-        *program_shape->mutable_parameter_names(param_no) =
-            parameter_request.name();
-      }
-    }
-
-    // The root determines the output shape.
-    TF_ASSIGN_OR_RETURN(const OperationRequest* root_request,
-                        GetRoot(version, session_computation_));
-    *program_shape->mutable_result() = root_request->output_shape();
-    if (ShapeUtil::IsOpaque(program_shape->result())) {
-      return Unimplemented("Computation results cannot be opaque");
-    }
-
-    program_shape_ = std::move(program_shape);
-    program_shape_version_ = version;
-  }
-
-  return program_shape_;
-}
-
-namespace {
-
-// A visitor which checks whether an operation is purely functional, meaning
-// that it doesn't depend on any parameter with an index higher than
-// num_parameters. The visitor walks the computation starting at a given
-// operation and sets is_functional to false iff a parameter or RNG operation
-// is encountered.
-void PureFunctionalVisitor(const SessionComputation& session_computation,
-                           const ComputationDataHandle& handle,
-                           int64 num_parameters, std::set<int64>* visited,
-                           bool* is_functional) {
-  if (visited->count(handle.handle()) != 0 || !*is_functional) {
-    return;
-  }
-
-  const OperationRequest& request =
-      session_computation.requests().at(handle.handle());
-  switch (request.request().op_case()) {
-    case OpRequest::kRngRequest:
-      *is_functional = false;
-      break;
-
-    case OpRequest::kConstantRequest:
-      break;
-
-    case OpRequest::kGetTupleElementRequest: {
-      const GetTupleElementRequest& get_tuple_element_request =
-          request.request().get_tuple_element_request();
-      PureFunctionalVisitor(session_computation,
-                            get_tuple_element_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kSliceRequest: {
-      const SliceRequest& slice_request = request.request().slice_request();
-      PureFunctionalVisitor(session_computation, slice_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kDynamicSliceRequest: {
-      const DynamicSliceRequest& dynamic_slice_request =
-          request.request().dynamic_slice_request();
-      PureFunctionalVisitor(session_computation,
-                            dynamic_slice_request.operand(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            dynamic_slice_request.start_indices(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kDynamicUpdateSliceRequest: {
-      const DynamicUpdateSliceRequest& dynamic_update_slice_request =
-          request.request().dynamic_update_slice_request();
-      PureFunctionalVisitor(session_computation,
-                            dynamic_update_slice_request.operand(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            dynamic_update_slice_request.update(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            dynamic_update_slice_request.start_indices(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kConcatenateRequest: {
-      const ConcatenateRequest& concatenate_request =
-          request.request().concatenate_request();
-      for (const ComputationDataHandle& handle :
-           concatenate_request.operands()) {
-        PureFunctionalVisitor(session_computation, handle, num_parameters,
-                              visited, is_functional);
-      }
-      break;
-    }
-
-    case OpRequest::kConvolveRequest: {
-      const ConvolveRequest& convolve_request =
-          request.request().convolve_request();
-      PureFunctionalVisitor(session_computation, convolve_request.lhs(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, convolve_request.rhs(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kFftRequest: {
-      const FftRequest& fft_request = request.request().fft_request();
-      PureFunctionalVisitor(session_computation, fft_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kCrossReplicaSumRequest: {
-      // TODO(b/33009255): Implement constant folding for cross replica sum.
- *is_functional = false; - break; - } - - case OpRequest::kInfeedRequest: { - *is_functional = false; - break; - } - - case OpRequest::kOutfeedRequest: { - *is_functional = false; - break; - } - - case OpRequest::kHostComputeRequest: { - *is_functional = false; - break; - } - - case OpRequest::kCallRequest: { - const CallRequest& call_request = request.request().call_request(); - for (const ComputationDataHandle& handle : call_request.operands()) { - PureFunctionalVisitor(session_computation, handle, num_parameters, - visited, is_functional); - } - // TODO(b/32495713): We aren't checking the to_apply computation itself, - // so we conservatively say that computations containing the Call op - // cannot be constant. We cannot set is_functional=false in other similar - // cases since we're already relying on IsConstant to return true. - *is_functional = false; - break; - } - - case OpRequest::kCustomCallRequest: { - *is_functional = false; - break; - } - - case OpRequest::kDotRequest: { - const DotRequest& dot_request = request.request().dot_request(); - PureFunctionalVisitor(session_computation, dot_request.lhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, dot_request.rhs(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kSendRequest: { - *is_functional = false; - break; - } - - case OpRequest::kRecvRequest: { - *is_functional = false; - break; - } - - case OpRequest::kMapRequest: { - const MapRequest& map_request = request.request().map_request(); - for (const ComputationDataHandle& handle : map_request.operands()) { - PureFunctionalVisitor(session_computation, handle, num_parameters, - visited, is_functional); - } - // TODO(b/32495713): We aren't checking the to_apply computation itself. - break; - } - - case OpRequest::kReduceRequest: { - const ReduceRequest& reduce_request = request.request().reduce_request(); - PureFunctionalVisitor(session_computation, reduce_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, reduce_request.init_value(), - num_parameters, visited, is_functional); - // TODO(b/32495713): We aren't checking the to_apply computation itself. - break; - } - - case OpRequest::kReduceWindowRequest: { - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - PureFunctionalVisitor(session_computation, - reduce_window_request.operand(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - reduce_window_request.init_value(), num_parameters, - visited, is_functional); - // TODO(b/32495713): We aren't checking the to_apply computation itself. - break; - } - - case OpRequest::kSelectAndScatterRequest: { - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - PureFunctionalVisitor(session_computation, - select_and_scatter_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - select_and_scatter_request.source(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - select_and_scatter_request.init_value(), - num_parameters, visited, is_functional); - // TODO(b/32495713): We aren't checking the select and scatter - // computations themselves. 
- break; - } - - case OpRequest::kBroadcastRequest: { - const BroadcastRequest& broadcast_request = - request.request().broadcast_request(); - PureFunctionalVisitor(session_computation, broadcast_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kReshapeRequest: { - const ReshapeRequest& reshape_request = - request.request().reshape_request(); - PureFunctionalVisitor(session_computation, reshape_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kReverseRequest: { - const ReverseRequest& reverse_request = - request.request().reverse_request(); - PureFunctionalVisitor(session_computation, reverse_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kPadRequest: { - const PadRequest& pad_request = request.request().pad_request(); - PureFunctionalVisitor(session_computation, pad_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, pad_request.padding_value(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kParameterRequest: { - const ParameterRequest& parameter_request = - request.request().parameter_request(); - if (parameter_request.parameter() >= num_parameters) { - *is_functional = false; - } - break; - } - - case OpRequest::kConvertRequest: { - const ConvertRequest& convert_request = - request.request().convert_request(); - PureFunctionalVisitor(session_computation, convert_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBitcastConvertRequest: { - const ConvertRequest& convert_request = - request.request().bitcast_convert_request(); - PureFunctionalVisitor(session_computation, convert_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kWhileRequest: { - const WhileRequest& while_request = request.request().while_request(); - PureFunctionalVisitor(session_computation, while_request.init(), - num_parameters, visited, is_functional); - // TODO(b/32495713): We aren't checking the condition and body - // computations themselves. - *is_functional = false; - break; - } - - case OpRequest::kConditionalRequest: { - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - PureFunctionalVisitor(session_computation, - conditional_request.predicate(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - conditional_request.true_operand(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - conditional_request.false_operand(), num_parameters, - visited, is_functional); - // TODO(b/32495713): We aren't checking the true and false computations - // themselves. 
- break; - } - - case OpRequest::kTernaryOpRequest: { - const TernaryOpRequest& ternary_op_request = - request.request().ternary_op_request(); - PureFunctionalVisitor(session_computation, ternary_op_request.lhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, ternary_op_request.rhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, ternary_op_request.ehs(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kTransposeRequest: { - const TransposeRequest& transpose_request = - request.request().transpose_request(); - PureFunctionalVisitor(session_computation, transpose_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kVariadicOpRequest: { - const VariadicOpRequest& variadic_op_request = - request.request().variadic_op_request(); - for (const ComputationDataHandle& handle : - variadic_op_request.operands()) { - PureFunctionalVisitor(session_computation, handle, num_parameters, - visited, is_functional); - } - break; - } - - case OpRequest::kUnaryOpRequest: { - const UnaryOpRequest& unary_op_request = - request.request().unary_op_request(); - PureFunctionalVisitor(session_computation, unary_op_request.operand(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBatchNormTrainingRequest: { - const BatchNormTrainingRequest& batch_norm_training_request = - request.request().batch_norm_training_request(); - PureFunctionalVisitor(session_computation, - batch_norm_training_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_training_request.scale(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_training_request.offset(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBatchNormInferenceRequest: { - const BatchNormInferenceRequest& batch_norm_inference_request = - request.request().batch_norm_inference_request(); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.operand(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.scale(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.offset(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.mean(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_inference_request.variance(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBatchNormGradRequest: { - const BatchNormGradRequest& batch_norm_grad_request = - request.request().batch_norm_grad_request(); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.operand(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.scale(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, batch_norm_grad_request.mean(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.variance(), num_parameters, - visited, is_functional); - PureFunctionalVisitor(session_computation, - batch_norm_grad_request.grad_output(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kBinaryOpRequest: { - const 
BinaryOpRequest& binary_op_request = - request.request().binary_op_request(); - PureFunctionalVisitor(session_computation, binary_op_request.lhs(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, binary_op_request.rhs(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::kGatherRequest: { - PureFunctionalVisitor(session_computation, - request.request().gather_request().input(), - num_parameters, visited, is_functional); - PureFunctionalVisitor(session_computation, - request.request().gather_request().gather_indices(), - num_parameters, visited, is_functional); - break; - } - - case OpRequest::OP_NOT_SET: - LOG(FATAL) << "OperationRequest doesn't contain a request"; - - default: - LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); - } - if (!*is_functional) { - VLOG(1) << "Non-functional: " << request.request().DebugString(); - } - visited->insert(handle.handle()); -} - -} // namespace - -StatusOr UserComputation::IsConstant(const ComputationDataHandle& handle, - int64 num_parameters) { - tensorflow::mutex_lock lock(mutex_); - - // Verify that the handle is valid. - auto operation_status = LookUpRequest(handle); - if (!operation_status.ok()) { - return operation_status.status(); - } - - bool is_constant = true; - std::set visited; - PureFunctionalVisitor(session_computation_, handle, num_parameters, &visited, - &is_constant); - - return is_constant; -} - -std::vector -UserComputation::GetEmbeddedComputations( - VersionedComputationHandle::Version version) const { - tensorflow::mutex_lock lock(mutex_); - - VLOG(1) - << "GetEmbeddedComputations(" << name() << " " - << VersionedComputationHandle{session_computation_.computation_handle(), - version} - << ")"; - XLA_VLOG_LINES(3, session_computation_.DebugString()); - - std::vector computations; - std::vector sorted_handles; - for (const auto& handle_request : session_computation_.requests()) { - sorted_handles.push_back(handle_request.first); - } - std::sort(sorted_handles.begin(), sorted_handles.end()); - for (int64 handle : sorted_handles) { - const auto& handle_request = session_computation_.requests().find(handle); - CHECK(handle_request != session_computation_.requests().end()); - int64 handle_value = handle_request->first; - if (handle_value <= version) { - const OperationRequest& request = handle_request->second; - switch (request.request().op_case()) { - case OpRequest::kCallRequest: { - CHECK_EQ(1, request.embedded_computation_versions_size()); - const CallRequest& call_request = request.request().call_request(); - const VersionedComputationHandle versioned_handle = { - call_request.to_apply(), - request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kMapRequest: { - CHECK_EQ(1, request.embedded_computation_versions_size()); - const MapRequest& map_request = request.request().map_request(); - const VersionedComputationHandle versioned_handle = { - map_request.to_apply(), request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kReduceRequest: { - CHECK_EQ(1, request.embedded_computation_versions_size()); - const ReduceRequest& reduce_request = - request.request().reduce_request(); - const VersionedComputationHandle versioned_handle = { - reduce_request.to_apply(), - request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kReduceWindowRequest: { - CHECK_EQ(1, 
request.embedded_computation_versions_size()); - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - const VersionedComputationHandle versioned_handle = { - reduce_window_request.to_apply(), - request.embedded_computation_versions(0)}; - computations.push_back(versioned_handle); - break; - } - - case OpRequest::kSelectAndScatterRequest: { - CHECK_EQ(2, request.embedded_computation_versions_size()); - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - const VersionedComputationHandle select_versioned_handle = { - select_and_scatter_request.select(), - request.embedded_computation_versions(0)}; - computations.push_back(select_versioned_handle); - const VersionedComputationHandle scatter_versioned_handle = { - select_and_scatter_request.scatter(), - request.embedded_computation_versions(1)}; - computations.push_back(scatter_versioned_handle); - break; - } - - case OpRequest::kWhileRequest: { - CHECK_EQ(2, request.embedded_computation_versions_size()); - const WhileRequest& while_request = request.request().while_request(); - const VersionedComputationHandle condition_versioned_handle = { - while_request.condition(), - request.embedded_computation_versions(0)}; - computations.push_back(condition_versioned_handle); - const VersionedComputationHandle body_versioned_handle = { - while_request.body(), request.embedded_computation_versions(1)}; - computations.push_back(body_versioned_handle); - break; - } - - case OpRequest::kConditionalRequest: { - CHECK_EQ(2, request.embedded_computation_versions_size()); - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - const VersionedComputationHandle true_computation_versioned_handle = { - conditional_request.true_computation(), - request.embedded_computation_versions(0)}; - computations.push_back(true_computation_versioned_handle); - const VersionedComputationHandle false_computation_versioned_handle = - {conditional_request.false_computation(), - request.embedded_computation_versions(1)}; - computations.push_back(false_computation_versioned_handle); - break; - } - - default: - // No embedded computation. 
- break; - } - } - } - VLOG(2) << "Embedded computations: " - << tensorflow::str_util::Join( - computations, ", ", - [](string* out, const VersionedComputationHandle& h) { - out->append(h.ToString()); - }); - return computations; -} - -StatusOr -UserComputation::LookUpRequestForErrorReporting( - const ComputationDataHandle& handle) const { - tensorflow::mutex_lock lock(mutex_); - return LookUpRequest(handle); -} - -tensorflow::gtl::optional UserComputation::ParameterMetadata( - int parameter_number) const { - tensorflow::mutex_lock lock(mutex_); - auto it = parameters_.find(parameter_number); - if (it == parameters_.end()) { - return tensorflow::gtl::nullopt; - } - OperationRequest* op = it->second; - return &op->request().metadata(); -} - -Status UserComputation::RemapEmbeddedComputations( - const std::map& old_to_new) { - auto update = [&old_to_new](ComputationHandle* to_update) -> Status { - int64 old = to_update->handle(); - auto it = old_to_new.find(old); - if (it == old_to_new.end()) { - string mapping = tensorflow::str_util::Join( - old_to_new, ", ", - [](string* out, std::pair element) { - tensorflow::strings::Appendf(out, "%lld:%lld", element.first, - element.second.handle()); - }); - return NotFound( - "could not find referenced (old) computation handle in mapping: " - "%lld; mapping: {%s}", - old, mapping.c_str()); - } - VLOG(2) << "remapping " << old << " to " << it->second.handle(); - *to_update = it->second; - return Status::OK(); - }; - TF_RETURN_IF_ERROR(update(session_computation_.mutable_computation_handle())); - for (auto& handle_request : *session_computation_.mutable_requests()) { - OperationRequest& request = handle_request.second; - switch (request.request().op_case()) { - case OpRequest::kCallRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - CallRequest* call_request = - request.mutable_request()->mutable_call_request(); - TF_RETURN_IF_ERROR(update(call_request->mutable_to_apply())); - break; - } - case OpRequest::kMapRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - MapRequest* map_request = - request.mutable_request()->mutable_map_request(); - TF_RETURN_IF_ERROR(update(map_request->mutable_to_apply())); - break; - } - case OpRequest::kReduceRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - ReduceRequest* reduce_request = - request.mutable_request()->mutable_reduce_request(); - TF_RETURN_IF_ERROR(update(reduce_request->mutable_to_apply())); - break; - } - case OpRequest::kReduceWindowRequest: { - TF_RET_CHECK(1 == request.embedded_computation_versions_size()); - ReduceWindowRequest* reduce_window_request = - request.mutable_request()->mutable_reduce_window_request(); - TF_RETURN_IF_ERROR(update(reduce_window_request->mutable_to_apply())); - break; - } - case OpRequest::kSelectAndScatterRequest: { - TF_RET_CHECK(2 == request.embedded_computation_versions_size()); - SelectAndScatterRequest* select_and_scatter_request = - request.mutable_request()->mutable_select_and_scatter_request(); - TF_RETURN_IF_ERROR( - update(select_and_scatter_request->mutable_select())); - TF_RETURN_IF_ERROR( - update(select_and_scatter_request->mutable_scatter())); - break; - } - case OpRequest::kWhileRequest: { - TF_RET_CHECK(2 == request.embedded_computation_versions_size()); - WhileRequest* while_request = - request.mutable_request()->mutable_while_request(); - TF_RETURN_IF_ERROR(update(while_request->mutable_condition())); - TF_RETURN_IF_ERROR(update(while_request->mutable_body())); - break; - } - 
case OpRequest::kConditionalRequest: { - TF_RET_CHECK(2 == request.embedded_computation_versions_size()); - ConditionalRequest* conditional_request = - request.mutable_request()->mutable_conditional_request(); - TF_RETURN_IF_ERROR( - update(conditional_request->mutable_true_computation())); - TF_RETURN_IF_ERROR( - update(conditional_request->mutable_false_computation())); - break; - } - default: - // No embedded computation. - TF_RET_CHECK(0 == request.embedded_computation_versions_size()); - break; - } - } - return Status::OK(); -} - -SessionComputation UserComputation::CloneSessionComputation( - VersionedComputationHandle::Version version) const { - tensorflow::mutex_lock lock(mutex_); - SessionComputation result = session_computation_; - // Erase all the requests that exceed the version specified. - // There's no lower_bound method on tensorflow::protobuf::Map so we iterate - // all the elements. - auto it = result.mutable_requests()->begin(); - while (it != result.mutable_requests()->end()) { - if (it->first > version) { - it = result.mutable_requests()->erase(it); - } else { - ++it; - } - } - return result; -} - -StatusOr UserComputation::LookUpRequest( - const ComputationDataHandle& handle) const { - int64 handle_value = handle.handle(); - if (session_computation_.requests().count(handle_value) == 0) { - return InvalidArgument("no ComputationDataHandle value %lld", handle_value); - } - return &session_computation_.requests().at(handle_value); -} - -Status UserComputation::CheckParametersAreContiguous( - VersionedComputationHandle::Version version) const { - TF_RET_CHECK(version > 0 && version < next_handle_value_); - - // Determine number of parameter inputs at the given version. - std::map parameter_requests; - for (int64 request_num = 1; request_num <= version; ++request_num) { - const OperationRequest& request = - session_computation_.requests().at(request_num); - - if (request.request().op_case() == OpRequest::kParameterRequest) { - const ParameterRequest& parameter_request = - request.request().parameter_request(); - // Duplicate parameters should be checked when parameter requests are - // added. - TF_RET_CHECK(0 == - parameter_requests.count(parameter_request.parameter())); - parameter_requests[parameter_request.parameter()] = ¶meter_request; - } - } - - for (int64 i = 0; i < parameter_requests.size(); ++i) { - auto it = parameter_requests.find(i); - if (it == parameter_requests.end()) { - return FailedPrecondition( - "computation %s does not have all its parameters populated " - "sequentially, missing parameter %lld", - name_.c_str(), i); - } - } - - return Status::OK(); -} - -namespace { - -// Helper class which builds an HLO computation from a SessionComputation. To -// construct the HLO computation, the SessionComputation graph is walked in -// DFS order lowering each OperationRequest to an HLO instruction. 
-class ComputationLowerer {
- public:
-  static StatusOr<std::unique_ptr<HloComputation>> Lower(
-      const string& computation_name,
-      const SessionComputation& session_computation,
-      VersionedComputationHandle::Version version,
-      UserComputation::HloComputationResolver hlo_resolver,
-      const DebugOptions& debug_options,
-      bool include_unreachable_instructions) {
-    ComputationLowerer lowerer(computation_name, session_computation, version,
-                               std::move(hlo_resolver), debug_options,
-                               include_unreachable_instructions);
-    return lowerer.Lower();
-  }
-
- private:
-  ComputationLowerer(const string& computation_name,
-                     const SessionComputation& session_computation,
-                     VersionedComputationHandle::Version version,
-                     UserComputation::HloComputationResolver hlo_resolver,
-                     const DebugOptions& debug_options,
-                     bool include_unreachable_instructions)
-      : hlo_builder_(computation_name),
-        session_computation_(session_computation),
-        version_(version),
-        hlo_resolver_(std::move(hlo_resolver)),
-        debug_options_(debug_options),
-        include_unreachable_instructions_(include_unreachable_instructions) {}
-
-  // Build an HLO computation from the SessionComputation at the given
-  // version.
-  StatusOr<std::unique_ptr<HloComputation>> Lower();
-
- private:
-  // Traverses the computation 'root' using a DFS, calling 'visit' in
-  // postorder.
-  void TraversePostorder(
-      const ComputationDataHandle& root,
-      std::unordered_map<int64, HloInstruction*>* visited,
-      const std::function<void(const ComputationDataHandle&)>& visit);
-
-  // DFS visitor of the UserComputation operations which lowers the operations
-  // to HLO instructions.
-  void Visit(const ComputationDataHandle& handle,
-             std::unordered_map<int64, HloInstruction*>* instructions);
-
-  // Resolves a ComputationHandle and Version to a previously lowered
-  // HloComputation using the hlo_resolver_ function.
-  HloComputation* ResolveComputation(
-      const ComputationHandle& handle,
-      VersionedComputationHandle::Version version);
-
-  // This function takes an input value which is being implicitly broadcast
-  // into an output shape and figures out the right kBroadcast instruction(s)
-  // necessary to replicate the implicit broadcast semantics explicitly.
-  HloInstruction* ImplicitBroadcastToExplicitBroadcast(
-      HloInstruction* operand, const Shape& output_shape);
-
-  HloComputation::Builder hlo_builder_;
-  const SessionComputation& session_computation_;
-  const VersionedComputationHandle::Version version_;
-  const UserComputation::HloComputationResolver hlo_resolver_;
-  const DebugOptions& debug_options_;
-  const bool include_unreachable_instructions_;
-};
-
-// Calls 'apply' on each operand of 'request'.
-static void ForEachOperand( - const OperationRequest& request, - const std::function& apply) { - switch (request.request().op_case()) { - case OpRequest::kRngRequest: { - const RngRequest& rng_request = request.request().rng_request(); - for (const ComputationDataHandle& param : rng_request.parameter()) { - apply(param); - } - break; - } - - case OpRequest::kConstantRequest: - break; - case OpRequest::kGetTupleElementRequest: { - const GetTupleElementRequest& get_tuple_element_request = - request.request().get_tuple_element_request(); - apply(get_tuple_element_request.operand()); - break; - } - - case OpRequest::kSliceRequest: { - const SliceRequest& slice_request = request.request().slice_request(); - apply(slice_request.operand()); - break; - } - - case OpRequest::kDynamicSliceRequest: { - const DynamicSliceRequest& dynamic_slice_request = - request.request().dynamic_slice_request(); - apply(dynamic_slice_request.operand()); - apply(dynamic_slice_request.start_indices()); - break; - } - - case OpRequest::kDynamicUpdateSliceRequest: { - const DynamicUpdateSliceRequest& dynamic_update_slice_request = - request.request().dynamic_update_slice_request(); - apply(dynamic_update_slice_request.operand()); - apply(dynamic_update_slice_request.update()); - apply(dynamic_update_slice_request.start_indices()); - break; - } - - case OpRequest::kConcatenateRequest: { - const ConcatenateRequest& concatenate_request = - request.request().concatenate_request(); - for (const ComputationDataHandle& handle : - concatenate_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kConvolveRequest: { - const ConvolveRequest& convolve_request = - request.request().convolve_request(); - apply(convolve_request.lhs()); - apply(convolve_request.rhs()); - break; - } - - case OpRequest::kFftRequest: { - const FftRequest& fft_request = request.request().fft_request(); - apply(fft_request.operand()); - break; - } - - case OpRequest::kBatchNormTrainingRequest: { - const BatchNormTrainingRequest& batch_norm_training_request = - request.request().batch_norm_training_request(); - - apply(batch_norm_training_request.operand()); - apply(batch_norm_training_request.scale()); - apply(batch_norm_training_request.offset()); - break; - } - - case OpRequest::kBatchNormInferenceRequest: { - const BatchNormInferenceRequest& batch_norm_inference_request = - request.request().batch_norm_inference_request(); - - apply(batch_norm_inference_request.operand()); - apply(batch_norm_inference_request.scale()); - apply(batch_norm_inference_request.offset()); - apply(batch_norm_inference_request.mean()); - apply(batch_norm_inference_request.variance()); - break; - } - - case OpRequest::kBatchNormGradRequest: { - const BatchNormGradRequest& batch_norm_grad_request = - request.request().batch_norm_grad_request(); - - apply(batch_norm_grad_request.operand()); - apply(batch_norm_grad_request.scale()); - apply(batch_norm_grad_request.mean()); - apply(batch_norm_grad_request.variance()); - apply(batch_norm_grad_request.grad_output()); - break; - } - - case OpRequest::kCrossReplicaSumRequest: { - const CrossReplicaSumRequest& cross_replica_sum_request = - request.request().cross_replica_sum_request(); - apply(cross_replica_sum_request.operand()); - break; - } - - case OpRequest::kInfeedRequest: - break; - - case OpRequest::kOutfeedRequest: { - const OutfeedRequest& outfeed_request = - request.request().outfeed_request(); - apply(outfeed_request.operand()); - break; - } - - case OpRequest::kMapRequest: { - const MapRequest& 
map_request = request.request().map_request(); - for (const ComputationDataHandle& handle : map_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kReduceRequest: { - const ReduceRequest& reduce_request = request.request().reduce_request(); - apply(reduce_request.operand()); - apply(reduce_request.init_value()); - break; - } - - case OpRequest::kReduceWindowRequest: { - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - apply(reduce_window_request.operand()); - apply(reduce_window_request.init_value()); - break; - } - - case OpRequest::kSelectAndScatterRequest: { - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - apply(select_and_scatter_request.operand()); - apply(select_and_scatter_request.source()); - apply(select_and_scatter_request.init_value()); - - break; - } - - case OpRequest::kBroadcastRequest: { - const BroadcastRequest& broadcast_request = - request.request().broadcast_request(); - apply(broadcast_request.operand()); - break; - } - - case OpRequest::kReshapeRequest: { - const ReshapeRequest& reshape_request = - request.request().reshape_request(); - apply(reshape_request.operand()); - break; - } - - case OpRequest::kTransposeRequest: { - const TransposeRequest& transpose_request = - request.request().transpose_request(); - apply(transpose_request.operand()); - break; - } - - case OpRequest::kReverseRequest: { - const ReverseRequest& reverse_request = - request.request().reverse_request(); - apply(reverse_request.operand()); - break; - } - - case OpRequest::kPadRequest: { - const PadRequest& pad_request = request.request().pad_request(); - apply(pad_request.operand()); - apply(pad_request.padding_value()); - break; - } - - case OpRequest::kRecvRequest: - case OpRequest::kParameterRequest: - break; - - case OpRequest::kConvertRequest: { - const ConvertRequest& convert_request = - request.request().convert_request(); - apply(convert_request.operand()); - break; - } - - case OpRequest::kBitcastConvertRequest: { - const ConvertRequest& convert_request = - request.request().bitcast_convert_request(); - apply(convert_request.operand()); - break; - } - - case OpRequest::kWhileRequest: { - const WhileRequest& while_request = request.request().while_request(); - apply(while_request.init()); - break; - } - - case OpRequest::kConditionalRequest: { - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - apply(conditional_request.predicate()); - apply(conditional_request.true_operand()); - apply(conditional_request.false_operand()); - break; - } - - case OpRequest::kTernaryOpRequest: { - const TernaryOpRequest& ternary_op_request = - request.request().ternary_op_request(); - apply(ternary_op_request.lhs()); - apply(ternary_op_request.rhs()); - apply(ternary_op_request.ehs()); - break; - } - - case OpRequest::kVariadicOpRequest: { - const VariadicOpRequest& variadic_op_request = - request.request().variadic_op_request(); - for (const ComputationDataHandle& handle : - variadic_op_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kCallRequest: { - const CallRequest& call_request = request.request().call_request(); - for (const ComputationDataHandle& handle : call_request.operands()) { - apply(handle); - } - break; - } - - case OpRequest::kCustomCallRequest: { - const CustomCallRequest& cc_request = - request.request().custom_call_request(); - for (const ComputationDataHandle& operand : 
cc_request.operands()) { - apply(operand); - } - break; - } - - case OpRequest::kHostComputeRequest: { - const HostComputeRequest& hc_request = - request.request().host_compute_request(); - for (const ComputationDataHandle& operand : hc_request.operands()) { - apply(operand); - } - break; - } - - case OpRequest::kDotRequest: { - const DotRequest& dot_request = request.request().dot_request(); - apply(dot_request.rhs()); - apply(dot_request.lhs()); - break; - } - - case OpRequest::kUnaryOpRequest: { - const UnaryOpRequest& unary_op_request = - request.request().unary_op_request(); - apply(unary_op_request.operand()); - break; - } - - case OpRequest::kBinaryOpRequest: { - const BinaryOpRequest& binary_op_request = - request.request().binary_op_request(); - apply(binary_op_request.rhs()); - apply(binary_op_request.lhs()); - break; - } - - case OpRequest::kReducePrecisionRequest: { - const ReducePrecisionRequest& reduce_precision_request = - request.request().reduce_precision_request(); - apply(reduce_precision_request.operand()); - break; - } - - case OpRequest::kTraceRequest: { - const TraceRequest& trace_request = request.request().trace_request(); - apply(trace_request.operand()); - break; - } - - case OpRequest::kSendRequest: { - const SendRequest& send_request = request.request().send_request(); - apply(send_request.operand()); - break; - } - - case OpRequest::kGatherRequest: { - const GatherRequest& gather_request = request.request().gather_request(); - apply(gather_request.input()); - apply(gather_request.gather_indices()); - break; - } - - case OpRequest::OP_NOT_SET: - LOG(FATAL) << "OperationRequest doesn't contain a request"; - - default: - LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); - } -} - -void ComputationLowerer::TraversePostorder( - const ComputationDataHandle& root, - std::unordered_map* visited, - const std::function& visit) { - // Stack containing {handle, enter} pairs. The 'enter' value describes whether - // we are entering or leaving 'handle'. - std::stack> work; - work.push({root, true}); - while (!work.empty()) { - ComputationDataHandle handle; - bool enter; - std::tie(handle, enter) = work.top(); - work.pop(); - - if (enter) { - // We are entering 'handle'. The first time we enter 'handle', we add it - // to 'visited' with a nullptr value. If 'handle' is already in 'visited', - // we do not visit it again. This algorithm only uses the presence of - // a handle in 'visited', but we use a map so we can use the same data - // structure to store the HloInstruction outputs. - if (visited->emplace(handle.handle(), nullptr).second) { - const OperationRequest& request = - session_computation_.requests().at(handle.handle()); - // Push the corresponding 'leave' action onto the stack, followed by - // the operands. - work.push({handle, false}); - ForEachOperand(request, [&work](const ComputationDataHandle& child) { - work.push({child, true}); - }); - } - } else { - // We are leaving 'handle'. We have visited the operands of 'handle', and - // now can visit the 'handle' itself. - visit(handle); - } - } -} - -StatusOr> ComputationLowerer::Lower() { - // Map from ComputationDataHandle to HLO instruction. Serves as a record of - // which operations have been visited as well as a cache for looking up - // ComputationDataHandles as HloInstructions. 
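The `TraversePostorder` routine above avoids recursion by pushing each handle twice: once as an "enter" marker (which pushes its operands) and once as a "leave" marker (which runs the visit). As a generic illustration of that same pattern — using a hypothetical `Node` type and `operands` field rather than the XLA types — a stack-based post-order walk can be sketched like this (C++17):

```cpp
#include <stack>
#include <unordered_set>
#include <utility>
#include <vector>

struct Node {
  int id;
  std::vector<Node*> operands;
};

// Iterative post-order walk: every node is pushed twice, first as an "enter"
// marker (push its operands), later as a "leave" marker (emit the node after
// all of its operands have been emitted).
template <typename Visit>
void PostOrder(Node* root, Visit visit) {
  std::stack<std::pair<Node*, bool>> work;  // {node, enter?}
  std::unordered_set<int> seen;             // nodes already entered
  work.push({root, true});
  while (!work.empty()) {
    auto [node, enter] = work.top();
    work.pop();
    if (enter) {
      if (seen.insert(node->id).second) {
        work.push({node, false});  // the matching "leave" action
        for (Node* operand : node->operands) work.push({operand, true});
      }
    } else {
      visit(node);  // operands were already visited
    }
  }
}
```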
- std::unordered_map instructions; - - TF_ASSIGN_OR_RETURN(const OperationRequest* root_request, - GetRoot(version_, session_computation_)); - - auto visit = [&](const ComputationDataHandle& handle) { - Visit(handle, &instructions); - }; - TraversePostorder(root_request->output_handle(), &instructions, visit); - HloInstruction* hlo_root = - instructions.at(root_request->output_handle().handle()); - - if (include_unreachable_instructions_) { - // Iterate through all computation data handles, and visit any unvisited - // operations. - for (int64 request_num = 1; request_num <= version_; ++request_num) { - TF_ASSIGN_OR_RETURN(const OperationRequest* request, - LookUpRequest(request_num, session_computation_)); - TraversePostorder(request->output_handle(), &instructions, visit); - } - } - - return hlo_builder_.Build(hlo_root); -} - -HloComputation* ComputationLowerer::ResolveComputation( - const ComputationHandle& handle, - VersionedComputationHandle::Version version) { - const VersionedComputationHandle checked_handle = {handle, version}; - return hlo_resolver_(checked_handle); -} - -HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast( - HloInstruction* operand, const Shape& output_shape) { - auto fadd = [this](std::unique_ptr x) { - return hlo_builder_.AddInstruction(std::move(x)); - }; - return fadd( - HloInstruction::CreateBroadcastSequence(output_shape, operand, fadd)); -} - -void ComputationLowerer::Visit( - const ComputationDataHandle& handle, - std::unordered_map* instructions) { - CHECK_LE(handle.handle(), version_); - CHECK(instructions->at(handle.handle()) == nullptr); - const OperationRequest& request = - session_computation_.requests().at(handle.handle()); - auto add_instruction = [&](std::unique_ptr instruction) { - HloInstruction* hlo_instruction = - hlo_builder_.AddInstruction(std::move(instruction)); - hlo_instruction->set_metadata(request.request().metadata()); - if (request.request().has_sharding()) { - OpSharding op_sharding = request.request().sharding(); - hlo_instruction->set_sharding( - HloSharding::FromProto(op_sharding).ValueOrDie()); - } - return hlo_instruction; - }; - auto lookup_instruction = [&](const ComputationDataHandle& handle) { - return instructions->at(handle.handle()); - }; - HloInstruction* hlo_instruction; - switch (request.request().op_case()) { - case OpRequest::kRngRequest: { - const RngRequest& rng_request = request.request().rng_request(); - std::vector parameters; - for (const ComputationDataHandle& param : rng_request.parameter()) { - parameters.push_back(lookup_instruction(param)); - } - hlo_instruction = add_instruction(HloInstruction::CreateRng( - request.output_shape(), rng_request.distribution(), parameters)); - break; - } - - case OpRequest::kConstantRequest: { - const ConstantRequest& constant_request = - request.request().constant_request(); - hlo_instruction = add_instruction(HloInstruction::CreateConstant( - Literal::CreateFromProto(constant_request.literal()) - .ConsumeValueOrDie())); - break; - } - - case OpRequest::kGetTupleElementRequest: { - const GetTupleElementRequest& get_tuple_element_request = - request.request().get_tuple_element_request(); - HloInstruction* operand = - lookup_instruction(get_tuple_element_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateGetTupleElement( - request.output_shape(), operand, get_tuple_element_request.index())); - break; - } - - case OpRequest::kSliceRequest: { - const SliceRequest& slice_request = request.request().slice_request(); - 
HloInstruction* operand = lookup_instruction(slice_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateSlice( - request.output_shape(), operand, - AsInt64Slice(slice_request.start_indices()), - AsInt64Slice(slice_request.limit_indices()), - AsInt64Slice(slice_request.strides()))); - break; - } - - case OpRequest::kDynamicSliceRequest: { - const DynamicSliceRequest& dynamic_slice_request = - request.request().dynamic_slice_request(); - HloInstruction* operand = - lookup_instruction(dynamic_slice_request.operand()); - HloInstruction* start_indices = - lookup_instruction(dynamic_slice_request.start_indices()); - - hlo_instruction = add_instruction(HloInstruction::CreateDynamicSlice( - request.output_shape(), operand, start_indices, - AsInt64Slice(dynamic_slice_request.slice_sizes()))); - break; - } - - case OpRequest::kDynamicUpdateSliceRequest: { - const DynamicUpdateSliceRequest& dynamic_update_slice_request = - request.request().dynamic_update_slice_request(); - HloInstruction* operand = - lookup_instruction(dynamic_update_slice_request.operand()); - HloInstruction* update = - lookup_instruction(dynamic_update_slice_request.update()); - HloInstruction* start_indices = - lookup_instruction(dynamic_update_slice_request.start_indices()); - hlo_instruction = - add_instruction(HloInstruction::CreateDynamicUpdateSlice( - request.output_shape(), operand, update, start_indices)); - break; - } - - case OpRequest::kConcatenateRequest: { - const ConcatenateRequest& concatenate_request = - request.request().concatenate_request(); - std::vector operands; - for (const ComputationDataHandle& handle : - concatenate_request.operands()) { - HloInstruction* operand = lookup_instruction(handle); - operands.push_back(operand); - } - hlo_instruction = add_instruction(HloInstruction::CreateConcatenate( - request.output_shape(), operands, concatenate_request.dimension())); - break; - } - - case OpRequest::kConvolveRequest: { - const ConvolveRequest& convolve_request = - request.request().convolve_request(); - HloInstruction* lhs = lookup_instruction(convolve_request.lhs()); - HloInstruction* rhs = lookup_instruction(convolve_request.rhs()); - hlo_instruction = add_instruction(HloInstruction::CreateConvolve( - request.output_shape(), lhs, rhs, convolve_request.window(), - convolve_request.dimension_numbers())); - break; - } - - case OpRequest::kFftRequest: { - const FftRequest& fft_request = request.request().fft_request(); - HloInstruction* operand = lookup_instruction(fft_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateFft( - request.output_shape(), operand, fft_request.fft_type(), - AsInt64Slice(fft_request.fft_length()))); - break; - } - - case OpRequest::kDotRequest: { - const DotRequest& dot_request = request.request().dot_request(); - HloInstruction* lhs = lookup_instruction(dot_request.lhs()); - HloInstruction* rhs = lookup_instruction(dot_request.rhs()); - hlo_instruction = add_instruction(HloInstruction::CreateDot( - request.output_shape(), lhs, rhs, dot_request.dimension_numbers())); - break; - } - - case OpRequest::kCrossReplicaSumRequest: { - const CrossReplicaSumRequest& cross_replica_sum_request = - request.request().cross_replica_sum_request(); - HloInstruction* operand = - lookup_instruction(cross_replica_sum_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum( - request.output_shape(), {operand})); - break; - } - - case OpRequest::kInfeedRequest: { - const InfeedRequest& infeed_request = 
request.request().infeed_request(); - hlo_instruction = add_instruction(HloInstruction::CreateInfeed( - request.output_shape(), infeed_request.config())); - break; - } - - case OpRequest::kOutfeedRequest: { - const OutfeedRequest& outfeed_request = - request.request().outfeed_request(); - HloInstruction* operand = lookup_instruction(outfeed_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateOutfeed( - outfeed_request.shape(), operand, outfeed_request.outfeed_config())); - break; - } - - case OpRequest::kMapRequest: { - const MapRequest& map_request = request.request().map_request(); - std::vector operands; - for (const ComputationDataHandle& handle : map_request.operands()) { - HloInstruction* operand = lookup_instruction(handle); - operands.push_back(operand); - } - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version map_version = - request.embedded_computation_versions(0); - HloComputation* map_computation = - ResolveComputation(map_request.to_apply(), map_version); - hlo_instruction = add_instruction(HloInstruction::CreateMap( - request.output_shape(), operands, map_computation)); - break; - } - - case OpRequest::kReduceRequest: { - const ReduceRequest& reduce_request = request.request().reduce_request(); - HloInstruction* operand = lookup_instruction(reduce_request.operand()); - HloInstruction* init_value = - lookup_instruction(reduce_request.init_value()); - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version reduce_version = - request.embedded_computation_versions(0); - HloComputation* reduce_computation = - ResolveComputation(reduce_request.to_apply(), reduce_version); - hlo_instruction = add_instruction(HloInstruction::CreateReduce( - request.output_shape(), operand, init_value, - AsInt64Slice(reduce_request.dimensions()), reduce_computation)); - break; - } - - case OpRequest::kReduceWindowRequest: { - const ReduceWindowRequest& reduce_window_request = - request.request().reduce_window_request(); - HloInstruction* operand = - lookup_instruction(reduce_window_request.operand()); - HloInstruction* init_value = - lookup_instruction(reduce_window_request.init_value()); - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version reduce_window_version = - request.embedded_computation_versions(0); - HloComputation* reduce_window_computation = ResolveComputation( - reduce_window_request.to_apply(), reduce_window_version); - hlo_instruction = add_instruction(HloInstruction::CreateReduceWindow( - request.output_shape(), operand, init_value, - reduce_window_request.window(), reduce_window_computation)); - break; - } - - case OpRequest::kSelectAndScatterRequest: { - const SelectAndScatterRequest& select_and_scatter_request = - request.request().select_and_scatter_request(); - HloInstruction* operand = - lookup_instruction(select_and_scatter_request.operand()); - HloInstruction* source = - lookup_instruction(select_and_scatter_request.source()); - HloInstruction* init_value = - lookup_instruction(select_and_scatter_request.init_value()); - CHECK_EQ(2, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version select_version = - request.embedded_computation_versions(0); - VersionedComputationHandle::Version scatter_version = - request.embedded_computation_versions(1); - HloComputation* select_computation = ResolveComputation( - select_and_scatter_request.select(), select_version); - HloComputation* 
scatter_computation = ResolveComputation( - select_and_scatter_request.scatter(), scatter_version); - hlo_instruction = add_instruction(HloInstruction::CreateSelectAndScatter( - request.output_shape(), operand, select_computation, - select_and_scatter_request.window(), source, init_value, - scatter_computation)); - break; - } - - case OpRequest::kBatchNormTrainingRequest: { - const BatchNormTrainingRequest& batch_norm_training_request = - request.request().batch_norm_training_request(); - HloInstruction* operand = - lookup_instruction(batch_norm_training_request.operand()); - HloInstruction* scale = - lookup_instruction(batch_norm_training_request.scale()); - HloInstruction* offset = - lookup_instruction(batch_norm_training_request.offset()); - - hlo_instruction = add_instruction(HloInstruction::CreateBatchNormTraining( - request.output_shape(), operand, scale, offset, - batch_norm_training_request.epsilon(), - batch_norm_training_request.feature_index())); - break; - } - - case OpRequest::kBatchNormInferenceRequest: { - const BatchNormInferenceRequest& batch_norm_inference_request = - request.request().batch_norm_inference_request(); - HloInstruction* operand = - lookup_instruction(batch_norm_inference_request.operand()); - HloInstruction* scale = - lookup_instruction(batch_norm_inference_request.scale()); - HloInstruction* offset = - lookup_instruction(batch_norm_inference_request.offset()); - HloInstruction* mean = - lookup_instruction(batch_norm_inference_request.mean()); - HloInstruction* variance = - lookup_instruction(batch_norm_inference_request.variance()); - - hlo_instruction = - add_instruction(HloInstruction::CreateBatchNormInference( - request.output_shape(), operand, scale, offset, mean, variance, - batch_norm_inference_request.epsilon(), - batch_norm_inference_request.feature_index())); - break; - } - - case OpRequest::kBatchNormGradRequest: { - const BatchNormGradRequest& batch_norm_grad_request = - request.request().batch_norm_grad_request(); - - HloInstruction* operand = - lookup_instruction(batch_norm_grad_request.operand()); - HloInstruction* scale = - lookup_instruction(batch_norm_grad_request.scale()); - HloInstruction* mean = lookup_instruction(batch_norm_grad_request.mean()); - HloInstruction* variance = - lookup_instruction(batch_norm_grad_request.variance()); - HloInstruction* grad_output = - lookup_instruction(batch_norm_grad_request.grad_output()); - - hlo_instruction = add_instruction(HloInstruction::CreateBatchNormGrad( - request.output_shape(), operand, scale, mean, variance, grad_output, - batch_norm_grad_request.epsilon(), - batch_norm_grad_request.feature_index())); - break; - } - - case OpRequest::kBroadcastRequest: { - const BroadcastRequest& broadcast_request = - request.request().broadcast_request(); - HloInstruction* operand = lookup_instruction(broadcast_request.operand()); - std::vector broadcast_dimensions; - // The client-level broadcast instruction just appends dimensions on the - // left (adds lowest numbered dimensions). The HLO broadcast op is more - // flexible and can add new dimensions anywhere. The broadcast_dimensions - // maps operand dimensions to dimensions in the broadcast output, so - // to append dimensions on the left the broadcast_dimensions should just - // be the n highest dimension numbers of the output shape where n is - // the number of input dimensions. 
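As a concrete reading of that comment (shapes chosen purely for illustration): broadcasting a rank-2 operand into a rank-4 output appends two new major dimensions on the left, so the operand's dimensions map to the two highest output dimension numbers, i.e. `broadcast_dimensions = {2, 3}`. A minimal helper computing that mapping the same way as the loop below might look like:

```cpp
#include <cstdint>
#include <vector>

// Illustration only: compute the HLO broadcast_dimensions for a client-level
// broadcast that appends new dimensions on the left of the operand shape.
std::vector<int64_t> AppendOnLeftBroadcastDims(int64_t operand_rank,
                                               int64_t output_rank) {
  std::vector<int64_t> dims;
  dims.reserve(operand_rank);
  for (int64_t i = 0; i < operand_rank; ++i) {
    dims.push_back(i + output_rank - operand_rank);
  }
  return dims;  // e.g. operand_rank = 2, output_rank = 4 -> {2, 3}
}
```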
- broadcast_dimensions.reserve(ShapeUtil::Rank(operand->shape())); - for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) { - broadcast_dimensions.push_back(i + - ShapeUtil::Rank(request.output_shape()) - - ShapeUtil::Rank(operand->shape())); - } - hlo_instruction = add_instruction(HloInstruction::CreateBroadcast( - request.output_shape(), operand, broadcast_dimensions)); - break; - } - - case OpRequest::kReshapeRequest: { - const ReshapeRequest& reshape_request = - request.request().reshape_request(); - HloInstruction* operand = lookup_instruction(reshape_request.operand()); - HloInstruction* transposed; - if (IsIdentityPermutation(AsInt64Slice(reshape_request.dimensions()))) { - transposed = operand; - } else { - transposed = add_instruction(HloInstruction::CreateTranspose( - ShapeUtil::PermuteDimensions( - InversePermutation(AsInt64Slice(reshape_request.dimensions())), - operand->shape()), - operand, AsInt64Slice(reshape_request.dimensions()))); - } - hlo_instruction = add_instruction( - HloInstruction::CreateReshape(request.output_shape(), transposed)); - break; - } - - case OpRequest::kTransposeRequest: { - const TransposeRequest& transpose_request = - request.request().transpose_request(); - HloInstruction* operand = lookup_instruction(transpose_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateTranspose( - ShapeUtil::PermuteDimensions( - InversePermutation(AsInt64Slice(transpose_request.dimensions())), - operand->shape()), - operand, AsInt64Slice(transpose_request.dimensions()))); - break; - } - - case OpRequest::kReverseRequest: { - const ReverseRequest& reverse_request = - request.request().reverse_request(); - HloInstruction* operand = lookup_instruction(reverse_request.operand()); - hlo_instruction = add_instruction(HloInstruction::CreateReverse( - request.output_shape(), operand, - AsInt64Slice(reverse_request.dimensions()))); - break; - } - - case OpRequest::kPadRequest: { - const PadRequest& pad_request = request.request().pad_request(); - HloInstruction* operand = lookup_instruction(pad_request.operand()); - HloInstruction* padding_value = - lookup_instruction(pad_request.padding_value()); - hlo_instruction = add_instruction(HloInstruction::CreatePad( - request.output_shape(), operand, padding_value, - pad_request.padding_config())); - break; - } - - case OpRequest::kRecvRequest: { - const RecvRequest& recv_request = request.request().recv_request(); - HloInstruction* recv = add_instruction(HloInstruction::CreateRecv( - request.output_shape(), recv_request.channel_handle().handle())); - hlo_instruction = add_instruction(HloInstruction::CreateRecvDone(recv)); - break; - } - - case OpRequest::kParameterRequest: { - const ParameterRequest& parameter_request = - request.request().parameter_request(); - hlo_instruction = add_instruction(HloInstruction::CreateParameter( - parameter_request.parameter(), request.output_shape(), - parameter_request.name())); - break; - } - - case OpRequest::kConvertRequest: { - const ConvertRequest& convert_request = - request.request().convert_request(); - HloInstruction* operand = lookup_instruction(convert_request.operand()); - hlo_instruction = add_instruction( - HloInstruction::CreateConvert(request.output_shape(), operand)); - break; - } - - case OpRequest::kBitcastConvertRequest: { - const ConvertRequest& convert_request = - request.request().bitcast_convert_request(); - HloInstruction* operand = lookup_instruction(convert_request.operand()); - hlo_instruction = 
add_instruction(HloInstruction::CreateBitcastConvert( - request.output_shape(), operand)); - break; - } - - case OpRequest::kWhileRequest: { - const WhileRequest& while_request = request.request().while_request(); - CHECK_EQ(2, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version condition_version = - request.embedded_computation_versions(0); - HloComputation* condition = - ResolveComputation(while_request.condition(), condition_version); - VersionedComputationHandle::Version body_version = - request.embedded_computation_versions(1); - HloComputation* body = - ResolveComputation(while_request.body(), body_version); - HloInstruction* init = lookup_instruction(while_request.init()); - hlo_instruction = add_instruction(HloInstruction::CreateWhile( - request.output_shape(), condition, body, init)); - break; - } - - case OpRequest::kConditionalRequest: { - const ConditionalRequest& conditional_request = - request.request().conditional_request(); - CHECK_EQ(2, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version true_computation_version = - request.embedded_computation_versions(0); - HloComputation* true_computation = ResolveComputation( - conditional_request.true_computation(), true_computation_version); - VersionedComputationHandle::Version false_computation_version = - request.embedded_computation_versions(1); - HloComputation* false_computation = ResolveComputation( - conditional_request.false_computation(), false_computation_version); - HloInstruction* predicate = - lookup_instruction(conditional_request.predicate()); - HloInstruction* true_operand = - lookup_instruction(conditional_request.true_operand()); - HloInstruction* false_operand = - lookup_instruction(conditional_request.false_operand()); - hlo_instruction = add_instruction(HloInstruction::CreateConditional( - request.output_shape(), predicate, true_operand, true_computation, - false_operand, false_computation)); - break; - } - - case OpRequest::kTernaryOpRequest: { - const TernaryOpRequest& ternary_op_request = - request.request().ternary_op_request(); - HloInstruction* lhs = lookup_instruction(ternary_op_request.lhs()); - HloInstruction* rhs = lookup_instruction(ternary_op_request.rhs()); - HloInstruction* ehs = lookup_instruction(ternary_op_request.ehs()); - auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop()); - if (debug_options_.xla_eliminate_hlo_implicit_broadcast() && - !ShapeUtil::IsTuple(request.output_shape())) { - if (!ShapeUtil::IsTuple(lhs->shape()) && - !ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) { - // lhs side is being implicitly broadcast. Change to explicit. 
- lhs = - ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape()); - } - - if (!ShapeUtil::IsTuple(rhs->shape()) && - !ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) { - rhs = - ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape()); - } - - if (!ShapeUtil::IsTuple(ehs->shape()) && - !ShapeUtil::SameDimensions(request.output_shape(), ehs->shape())) { - ehs = - ImplicitBroadcastToExplicitBroadcast(ehs, request.output_shape()); - } - } - - hlo_instruction = add_instruction(HloInstruction::CreateTernary( - request.output_shape(), hlo_opcode, lhs, rhs, ehs)); - break; - } - - case OpRequest::kVariadicOpRequest: { - const VariadicOpRequest& variadic_op_request = - request.request().variadic_op_request(); - std::vector operands; - for (const ComputationDataHandle& handle : - variadic_op_request.operands()) { - HloInstruction* operand = lookup_instruction(handle); - operands.push_back(operand); - } - auto hlo_opcode = - VariadicOperationToHloOpcode(variadic_op_request.varop()); - hlo_instruction = add_instruction(HloInstruction::CreateVariadic( - request.output_shape(), hlo_opcode, operands)); - break; - } - - case OpRequest::kCallRequest: { - const CallRequest& call_request = request.request().call_request(); - std::vector operands; - for (const ComputationDataHandle& handle : call_request.operands()) { - operands.push_back(lookup_instruction(handle)); - } - CHECK_EQ(1, request.embedded_computation_versions_size()); - VersionedComputationHandle::Version call_version = - request.embedded_computation_versions(0); - HloComputation* call_computation = - ResolveComputation(call_request.to_apply(), call_version); - hlo_instruction = add_instruction(HloInstruction::CreateCall( - request.output_shape(), operands, call_computation)); - break; - } - - case OpRequest::kCustomCallRequest: { - const CustomCallRequest& cc_request = - request.request().custom_call_request(); - std::vector operands; - for (const ComputationDataHandle& operand : cc_request.operands()) { - operands.push_back(lookup_instruction(operand)); - } - hlo_instruction = add_instruction(HloInstruction::CreateCustomCall( - cc_request.shape(), operands, cc_request.call_target_name())); - break; - } - - case OpRequest::kHostComputeRequest: { - const HostComputeRequest& host_compute_request = - request.request().host_compute_request(); - std::vector operands; - for (const ComputationDataHandle& operand : - host_compute_request.operands()) { - operands.push_back(lookup_instruction(operand)); - } - auto output_shape = host_compute_request.shape(); - auto channel_name = host_compute_request.channel_name(); - auto cost_estimate_ns = host_compute_request.cost_estimate_ns(); - hlo_instruction = add_instruction(HloInstruction::CreateHostCompute( - output_shape, operands, channel_name, cost_estimate_ns)); - break; - } - - case OpRequest::kUnaryOpRequest: { - const UnaryOpRequest& unary_op_request = - request.request().unary_op_request(); - HloInstruction* operand = lookup_instruction(unary_op_request.operand()); - auto hlo_opcode = UnaryOperationToHloOpcode(unary_op_request.unop()); - hlo_instruction = add_instruction(HloInstruction::CreateUnary( - request.output_shape(), hlo_opcode, operand)); - break; - } - - case OpRequest::kBinaryOpRequest: { - const BinaryOpRequest& binary_op_request = - request.request().binary_op_request(); - HloInstruction* lhs = lookup_instruction(binary_op_request.lhs()); - HloInstruction* rhs = lookup_instruction(binary_op_request.rhs()); - auto hlo_opcode = 
BinaryOperationToHloOpcode(binary_op_request.binop()); - if (binary_op_request.broadcast_dimensions_size() > 0 && - ShapeUtil::Rank(lhs->shape()) != ShapeUtil::Rank(rhs->shape())) { - // Emit a broadcast instruction to perform the "broadcast in dimension" - // operation. - HloInstruction* operand_to_broadcast = - ShapeUtil::Rank(lhs->shape()) < ShapeUtil::Rank(rhs->shape()) ? lhs - : rhs; - CHECK_EQ(ShapeUtil::Rank(operand_to_broadcast->shape()), - binary_op_request.broadcast_dimensions().size()); - - // Construct the bounds of the shape of the kBroadcast instruction - // responsible for the in-dimension broadcast. - std::vector output_dimensions; - for (int64 size : request.output_shape().dimensions()) { - output_dimensions.push_back(size); - } - for (int64 operand_dim = 0; - operand_dim < ShapeUtil::Rank(operand_to_broadcast->shape()); - ++operand_dim) { - int64 output_dim = - binary_op_request.broadcast_dimensions()[operand_dim]; - output_dimensions[output_dim] = - operand_to_broadcast->shape().dimensions(operand_dim); - } - - Shape broadcast_shape = ShapeUtil::MakeShape( - operand_to_broadcast->shape().element_type(), output_dimensions); - - // The broadcast semantics of a client-level binary op broadcast is - // identical to the HLO broadcast semantics so the broadcast_dimensions - // field can just be passed to the instruction builder. - HloInstruction* broadcasted_operand = - add_instruction(HloInstruction::CreateBroadcast( - broadcast_shape, operand_to_broadcast, - AsInt64Slice(binary_op_request.broadcast_dimensions()))); - - lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs; - rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs; - } - if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) { - if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) { - // lhs side is being implicitly broadcast. Change to explicit. 
- lhs = - ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape()); - } - - if (!ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) { - rhs = - ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape()); - } - } - hlo_instruction = add_instruction(HloInstruction::CreateBinary( - request.output_shape(), hlo_opcode, lhs, rhs)); - break; - } - - case OpRequest::kReducePrecisionRequest: { - const ReducePrecisionRequest& reduce_precision_request = - request.request().reduce_precision_request(); - HloInstruction* operand = - lookup_instruction(reduce_precision_request.operand()); - auto exponent_bits = reduce_precision_request.exponent_bits(); - auto mantissa_bits = reduce_precision_request.mantissa_bits(); - hlo_instruction = add_instruction(HloInstruction::CreateReducePrecision( - request.output_shape(), operand, exponent_bits, mantissa_bits)); - break; - } - - case OpRequest::kTraceRequest: { - const TraceRequest& trace_request = request.request().trace_request(); - HloInstruction* operand = lookup_instruction(trace_request.operand()); - hlo_instruction = add_instruction( - HloInstruction::CreateTrace(trace_request.tag(), operand)); - break; - } - - case OpRequest::kSendRequest: { - const SendRequest& send_request = request.request().send_request(); - HloInstruction* operand = lookup_instruction(send_request.operand()); - HloInstruction* send = add_instruction(HloInstruction::CreateSend( - operand, send_request.channel_handle().handle())); - hlo_instruction = add_instruction(HloInstruction::CreateSendDone(send)); - break; - } - - case OpRequest::kGatherRequest: { - const GatherRequest& gather_request = request.request().gather_request(); - HloInstruction* input_operand = - lookup_instruction(gather_request.input()); - HloInstruction* gather_indices_operand = - lookup_instruction(gather_request.gather_indices()); - std::vector window_bounds; - c_copy(gather_request.window_bounds(), std::back_inserter(window_bounds)); - hlo_instruction = add_instruction(HloInstruction::CreateGather( - request.output_shape(), input_operand, gather_indices_operand, - gather_request.dimension_numbers(), window_bounds)); - break; - } - - case OpRequest::OP_NOT_SET: - LOG(FATAL) << "OperationRequest doesn't contain a request"; - - default: - LOG(FATAL) << "Unexpected request type: " << request.request().op_case(); - } - (*instructions)[handle.handle()] = hlo_instruction; -} // NOLINT(readability/fn_size) - -} // namespace - -StatusOr> UserComputation::BuildHloComputation( - VersionedComputationHandle::Version version, - HloComputationResolver hlo_resolver, const DebugOptions& debug_options, - bool include_unreachable_instructions) const { - tensorflow::mutex_lock lock(mutex_); - - VLOG(2) << "Building HloComputation from UserComputation " << name_ - << " at version " << version; - XLA_VLOG_LINES(3, session_computation_.DebugString()); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_computation, - ComputationLowerer::Lower( - tensorflow::strings::StrCat(name(), ".v", version), - session_computation_, version, std::move(hlo_resolver), debug_options, - include_unreachable_instructions)); - - return std::move(hlo_computation); -} - -} // namespace xla diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h deleted file mode 100644 index 5544c868fe905c..00000000000000 --- a/tensorflow/compiler/xla/service/user_computation.h +++ /dev/null @@ -1,413 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_ -#define TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_ - -#include -#include -#include -#include -#include - -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/session.pb.h" -#include "tensorflow/compiler/xla/service/versioned_computation_handle.h" -#include "tensorflow/compiler/xla/statusor.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/platform/macros.h" -#include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/platform/thread_annotations.h" -#include "tensorflow/core/platform/types.h" - -namespace xla { - -// A UserComputation is the built-up computation that users create via the -// XLA Service interface. -// -// The XLA service adds instructions to a user computation via this -// interface. The state of the computation is stored as a SessionComputation -// proto which holds a record of all operation-building requests received by the -// XLA service. -// -// UserComputations are lowered to HloComputations which are passed to the high -// level compiler interface. -class UserComputation { - public: - // Factory used when restoring a computation from serialized session - // computation (computation snapshot) data. Remaps any references to - // computation handle via the old_to_new mapping. - // - // An error will occur if the old_to_new mapping cannot resolve a reference to - // a computation that is present in session_computation. - static StatusOr> MakeWithRemapping( - const SessionComputation& session_computation, - const ComputationHandle& handle, - const std::map& old_to_new); - - // Creates an empty computation with the given name and computation handle. - explicit UserComputation(const string& name, const ComputationHandle& handle); - - // Enqueues a parameter-retrieving instruction onto this user computation. - // Returns an error status if the parameter number is already registered with - // different values. - StatusOr AddParameterInstruction( - const ParameterRequest& parameter_request); - - // Enqueues a pad instruction onto this user computation. - StatusOr AddPadInstruction( - const PadRequest& pad_request); - - // Enqueues a tracing instruction onto this user computation. - // Returns an error status if the operand cannot be resolved. - Status AddTraceInstruction(const TraceRequest& trace_request); - - // Enqueues a random number generation instruction onto this user computation. - StatusOr AddRngInstruction( - const RngRequest& rng_request); - - // Enqueues a unary instruction onto this user computation. - // Returns an error status if the operand index is out of bounds. 
- StatusOr AddUnaryInstruction( - const UnaryOpRequest& unary_request); - - // Enqueues a batch norm training instruction onto this user computation. - StatusOr AddBatchNormTrainingInstruction( - const BatchNormTrainingRequest& batch_norm_training_request); - - // Enqueues a batch norm inference instruction onto this user computation. - StatusOr AddBatchNormInferenceInstruction( - const BatchNormInferenceRequest& batch_norm_inference_request); - - // Enqueues a batch norm grad instruction onto this user computation. - StatusOr AddBatchNormGradInstruction( - const BatchNormGradRequest& batch_norm_grad_request); - - // Enqueues a binary instruction onto this user computation. - // Returns an error status if the operand indices are out of bounds. - StatusOr AddBinaryInstruction( - const BinaryOpRequest& binary_request); - - // Enqueues a ternary instruction onto this user computation. - // Returns an error status if the operand indices are out of bounds. - StatusOr AddTernaryInstruction( - const TernaryOpRequest& ternary_request); - - // Enqueues a variadic instruction onto this user computation. - // Returns an error status if the operand indices are out of bounds. - StatusOr AddVariadicInstruction( - const VariadicOpRequest& variadic_request); - - // Enqueues a constant instruction onto this user computation. - StatusOr AddConstantInstruction( - const ConstantRequest& constant_request); - - // Enqueues a get tuple element instruction onto this user computation. - StatusOr AddGetTupleElementInstruction( - const GetTupleElementRequest& get_tuple_element_request); - - // Enqueues a map instruction onto this user computation. - StatusOr AddMapInstruction( - const MapRequest& map_request, - const UserComputation& to_apply_computation); - - // Enqueues a reduce-precision instruction onto this user computation. - StatusOr AddReducePrecisionInstruction( - const ReducePrecisionRequest& reduce_precision_request); - - // Enqueues a convolution instruction onto this user computation. - StatusOr AddConvolveInstruction( - const ConvolveRequest& convolve_request); - - // Enqueues an FFT instruction onto this user computation. - StatusOr AddFftInstruction( - const FftRequest& fft_request); - - // Enqueues a cross replica sum instruction onto this user computation. - StatusOr AddCrossReplicaSumInstruction( - const CrossReplicaSumRequest& cross_replica_sum_request); - - // Enqueues an infeed instruction onto this user computation. - StatusOr AddInfeedInstruction( - const InfeedRequest& infeed_request); - - // Enqueues an outfeed instruction onto this user computation. - StatusOr AddOutfeedInstruction( - const OutfeedRequest& outfeed_request); - - // Enqueues a host compute instruction onto this user computation. - StatusOr AddHostComputeInstruction( - const HostComputeRequest& host_compute_request); - - // Enqueues a call instruction onto this user computation. - StatusOr AddCallInstruction( - const CallRequest& call_request, - const UserComputation& to_apply_computation); - - // Enqueues a custom call instruction onto this user computation. - StatusOr AddCustomCallInstruction( - const CustomCallRequest& custom_call_request); - - // Enqueues a dot instruction onto this user computation. - StatusOr AddDotInstruction( - const DotRequest& dot_request); - - // Enqueues a broadcast instruction onto this user computation. - StatusOr AddBroadcastInstruction( - const BroadcastRequest& broadcast_request); - - // Enqueues a reshape instruction onto this user computation. 
- StatusOr AddReshapeInstruction( - const ReshapeRequest& reshape_request); - - // Enqueues a transpose instruction onto this user computation. - StatusOr AddTransposeInstruction( - const TransposeRequest& transpose_request); - - // Enqueues a slice instruction onto this user computation. - StatusOr AddSliceInstruction( - const SliceRequest& slice_request); - - // Enqueues a dynamic slice instruction onto this user computation. - StatusOr AddDynamicSliceInstruction( - const DynamicSliceRequest& dynamic_slice_request); - - // Enqueues a dynamic update slice instruction onto this user computation. - StatusOr AddDynamicUpdateSliceInstruction( - const DynamicUpdateSliceRequest& dynamic_update_slice_request); - - // Enqueues a concatenate instruction onto this user computation. - StatusOr AddConcatenateInstruction( - const ConcatenateRequest& concatenate_request); - - // Enqueues a convert instruction onto this user computation. - StatusOr AddConvertInstruction( - const ConvertRequest& convert_request); - - // Enqueues a bitcast element instruction onto this user computation. - StatusOr AddBitcastConvertInstruction( - const ConvertRequest& convert_request); - - // Enqueues a reduce instruction onto this user computation. - StatusOr AddReduceInstruction( - const ReduceRequest& reduce_request, - const UserComputation& to_apply_computation); - - // Enqueues a windowed reduce instruction onto this user computation. - StatusOr AddReduceWindowInstruction( - const ReduceWindowRequest& reduce_window_request, - const UserComputation& to_apply_computation); - - // Enqueues a select-and-scatter instruction onto this user - // computation. - StatusOr AddSelectAndScatterInstruction( - const SelectAndScatterRequest& select_and_scatter_request, - const UserComputation& select_computation, - const UserComputation& scatter_computation); - - // Enqueues a reverse instruction onto this user computation. - StatusOr AddReverseInstruction( - const ReverseRequest& reverse_request); - - // Enqueues a while instruction onto this user computation. - StatusOr AddWhileInstruction( - const WhileRequest& while_request, - const UserComputation& condition_computation, - const UserComputation& body_computation); - - // Enqueues a conditional instruction on this user computation. - StatusOr AddConditionalInstruction( - const ConditionalRequest& conditional_request, - const UserComputation& true_computation, - const UserComputation& false_computation); - - // Enqueues a Send instruction onto this user computation. - StatusOr AddSendInstruction( - const SendRequest& send_request); - - // Enqueues a Recv instruction onto this user computation. - StatusOr AddRecvInstruction( - const RecvRequest& recv_request); - - // Enqueues a Gather instruction onto this user computation. - StatusOr AddGatherInstruction( - const GatherRequest& gather_request); - - // Returns the user-provided name of this user computation, which is provided - // via the XLA computation-building API. - const string& name() const { return name_; } - - // Subsequent executions of this computation will compute the value - // represented by handle, rather than the last expression enqueued - // on the computation. - Status SetReturnValue(const ComputationDataHandle& handle); - - // Return a versioned handle for this computation. - VersionedComputationHandle GetVersionedHandle() const; - - // Return a versioned handle for this computation with a version equal to the - // point at which given operation was added to the computation. 
- VersionedComputationHandle GetVersionedHandleAtOperation( - const ComputationDataHandle& operation) const; - - // Return a version value representing the current state of the - // computation. - VersionedComputationHandle::Version version() const; - - // Computes and returns the program shape for the user computation -- gathers - // parameters and result type into a single proto. A shared_ptr is used - // because the returned pointer refers to an internally cached value which may - // be discarded by the UserComputation object. This avoid unnecessary copies. - // - // If the parameter space is not dense (i.e. there are holes in the parameter - // numbers provided) then an error status is returned. - StatusOr> ComputeProgramShape( - VersionedComputationHandle::Version version) const; - - // Returns true if the given data handle does not depend on any parameter with - // index higher then num_parameters. That is, the value can be computed at - // compile time if we know the first num_parameters arguments. - StatusOr IsConstant(const ComputationDataHandle& handle, - int64 num_parameters); - - // Returns the output shape of the operation indicated by the given handle. - StatusOr GetShape(const ComputationDataHandle& handle); - - // Sets metadata on the Hlo instruction referenced by the given handle. - Status SetOpMetadata(const ComputationDataHandle& handle, - const OpMetadata& metadata); - - // Sets the device assignment on the Hlo instruction referenced by 'handle'. - Status SetOpSharding(const ComputationDataHandle& handle, - const OpSharding& sharding); - - // Builds a HLO computation from the UserComputation. The parameter "resolver" - // is a function which returns a pointer to the HloComputation corresponding - // to the given ComputationHandle at the given version. The resolver is used - // for operations, such as map, which call other computations and need a - // pointer to the called HloComputation to construct the respective HLO - // instructions. If include_unreachable_instructions is true, then - // instructions which are not reachable from the root are lowered into - // HloInstructions. - using HloComputationResolver = - std::function; - StatusOr> BuildHloComputation( - VersionedComputationHandle::Version version, - HloComputationResolver hlo_resolver, const DebugOptions& debug_options, - bool include_unreachable_instructions = true) const; - - // Return a vector containing the embedded computations used by this - // UserComputation. Only embedded computations which are called directly by - // this UserComputation are included. That is, the transitive closure of - // embedded computations is not included. - std::vector GetEmbeddedComputations( - VersionedComputationHandle::Version version) const; - - // Returns the number of OperationRequest objects in this UserComputation. - // The 'version' of a computation is identical to the number of - // OperationRequests in the UserComputation. - int64 request_count(VersionedComputationHandle::Version version) const { - return version; - } - - // Returns a copy of the internal session state for this computation -- this - // is useful for serializing the guts of a user computation, though references - // to other handles (e.g. referred-to computations) must be handled with care - // in the serialization / de-serialization process. 
- SessionComputation CloneSessionComputation( - VersionedComputationHandle::Version version) const; - - // Warning: typically we don't want to look up computation data handles until - // the computation is finished being built, for consistency purposes. We - // expose this routine for error reporting purposes so that we can provide - // more meaningful error messages from the XLA service layer. - // - // Returns the operation request that the handle comes from. - StatusOr LookUpRequestForErrorReporting( - const ComputationDataHandle& handle) const; - - // Retrieves the parameter metadata for the given parameter number. - // - // If the parameter number is invalid for this computation, nullopt is - // returned. When the return value has_value(), nullptr will never be - // the held value. - tensorflow::gtl::optional ParameterMetadata( - int parameter_number) const; - - private: - // Warning: dangerous mutating operation that doesn't respect versioning. - // This is only used at initialization time when constructing from a - // SessionComputation a la MakeWithRemapping. - // - // Remaps references to old computations (with handle values in the keys of - // old_to_new) to the computation handle given in the values. This is useful - // when loading computations from snapshots, to finish initialization, before - // the user computation is released into the wild. - Status RemapEmbeddedComputations( - const std::map& old_to_new) - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Returns the OperationRequest corresponding to the given handle. - StatusOr LookUpRequest( - const ComputationDataHandle& handle) const - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Creates a new ComputationDataHandle with the next available handle value. - ComputationDataHandle CreateComputationDataHandle() - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Checks whether the parameter numbers of the parameter operations are - // contiguous starting from zero. Returns appropriate error status if not. - Status CheckParametersAreContiguous( - VersionedComputationHandle::Version version) const - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - VersionedComputationHandle GetVersionedHandleInternal() const - EXCLUSIVE_LOCKS_REQUIRED(mutex_); - - // Name of the computation. - string name_; - - mutable tensorflow::mutex mutex_; - - // State of the computation as a record of all operation-building requests. - SessionComputation session_computation_ GUARDED_BY(mutex_); - - // Mapping from parameter number to operation request containing the - // respective ParameterRequest. - std::map parameters_ GUARDED_BY(mutex_); - - // The next ComputationDataHandle value to assign. Handle values are assigned - // sequentially. - int64 next_handle_value_ GUARDED_BY(mutex_); - - // If handle_to_return_.has_handle() then an Execution of this Computation - // will compute the value represented by handle_to_return_, otherwise it will - // compute the value of (next_handle_value_ - 1). - ComputationDataHandle handle_to_return_ GUARDED_BY(mutex_); - - // Memoized ProgramShape and its version. A shared_ptr is used because - // references to this object are returned by ComputeProgramShape. 
- mutable int64 program_shape_version_ GUARDED_BY(mutex_) = 0; - mutable std::shared_ptr program_shape_ GUARDED_BY(mutex_); - - TF_DISALLOW_COPY_AND_ASSIGN(UserComputation); -}; - -} // namespace xla - -#endif // TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_ diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc deleted file mode 100644 index 2fa163953f638c..00000000000000 --- a/tensorflow/compiler/xla/service/user_computation_test.cc +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include "tensorflow/compiler/xla/service/user_computation.h" - -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/hlo_computation.h" -#include "tensorflow/compiler/xla/service/hlo_matchers.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/test_helpers.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" -#include "tensorflow/core/lib/core/status_test_util.h" - -namespace op = xla::testing::opcode_matchers; - -namespace xla { -namespace { - -using UserComputationTest = ::testing::Test; - -TEST_F(UserComputationTest, SimpleComputation) { - const Shape kScalarShape = ShapeUtil::MakeShape(F32, {}); - const Shape kVectorShape = ShapeUtil::MakeShape(F32, {2}); - - // Build a simple three operation computatation: - // - // %constant = Constant({123, 42}) - // %param = Param(0) - // %outfeed = Outfeed(%constant) - // - // Build the computation at two different versions and check invariants. - ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ConstantRequest constant_request; - *constant_request.mutable_literal() = - Literal::CreateR1({123.0f, 42.0f})->ToProto(); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle constant_handle, - computation.AddConstantInstruction(constant_request)); - - ParameterRequest param_request; - *param_request.mutable_shape() = kScalarShape; - param_request.set_parameter(0); - param_request.set_name("param0"); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle param_handle, - computation.AddParameterInstruction(param_request)); - OpMetadata metadata; - metadata.set_op_name("meta"); - TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata)); - - OutfeedRequest outfeed_request; - *outfeed_request.mutable_operand() = constant_handle; - *outfeed_request.mutable_shape() = kVectorShape; - outfeed_request.set_outfeed_config("abc"); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle outfeed_handle, - computation.AddOutfeedInstruction(outfeed_request)); - - auto hlo_resolver = [](const VersionedComputationHandle& handle) { - return nullptr; - }; - { - // Test the computation at the latest version. 
In this case, the most - // recently added operation is an outfeed. However, the outfeed is not the - // root because outfeeds cannot be the root of a computation. - VersionedComputationHandle latest_version = - computation.GetVersionedHandle(); - - // Program shape should have a single scalar parameter and scalar - // result. The outfeed instruction should not affect the program shape. - TF_ASSERT_OK_AND_ASSIGN( - std::shared_ptr program_shape, - computation.ComputeProgramShape(latest_version.version)); - ASSERT_EQ(1, program_shape->parameters_size()); - EXPECT_TRUE( - ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0))); - EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result())); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - DebugOptions())); - // There should be one HloInstruction per UserComputation operation. - EXPECT_EQ(3, hlo_computation->instruction_count()); - // The root of the instruction should be the parameter instruction (not the - // outfeed). - EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter()); - } - - { - // Test the computation at the version right after the parameter instruction - // is added. - VersionedComputationHandle version_at_param = - computation.GetVersionedHandleAtOperation(param_handle); - - // Program shape should have a single scalar parameter, and scalar result. - TF_ASSERT_OK_AND_ASSIGN( - std::shared_ptr program_shape, - computation.ComputeProgramShape(version_at_param.version)); - ASSERT_EQ(1, program_shape->parameters_size()); - EXPECT_TRUE( - ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0))); - EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result())); - - // There should be two instructions, one for the constant and one for the - // parameter. The outfeed instruction should not be included. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(version_at_param.version, hlo_resolver, - DebugOptions())); - EXPECT_EQ(2, hlo_computation->instruction_count()); - EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter()); - } - { - // Test the computation at the latest version, but lowered with - // include_unreachable_instructions set to false. - VersionedComputationHandle latest_version = - computation.GetVersionedHandle(); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation( - latest_version.version, hlo_resolver, DebugOptions(), - /*include_unreachable_instructions=*/false)); - // There is only one reachable instruction, the parameter. - EXPECT_EQ(1, hlo_computation->instruction_count()); - // The root of the instruction should be the parameter instruction (not the - // outfeed). - EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter()); - EXPECT_EQ(hlo_computation->root_instruction()->metadata().op_name(), - "meta"); - } -} - -TEST_F(UserComputationTest, EliminateScalarBroadcast) { - auto debug_options = DebugOptions(); - debug_options.set_xla_eliminate_hlo_implicit_broadcast(true); - - // Build a binary computation with scalar broadcast. 
- // - // %a = Constant({123, 42}) - // %b = Constant(1) - // %add = Add(%a, %b) - ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ConstantRequest a_request; - *a_request.mutable_literal() = - Literal::CreateR1({123.0f, 42.0f})->ToProto(); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle, - computation.AddConstantInstruction(a_request)); - - ConstantRequest b_request; - *b_request.mutable_literal() = Literal::CreateR0(1.0f)->ToProto(); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle, - computation.AddConstantInstruction(b_request)); - - BinaryOpRequest add; - add.set_binop(BINOP_ADD); - *add.mutable_lhs() = a_handle; - *add.mutable_rhs() = b_handle; - TF_ASSERT_OK(computation.AddBinaryInstruction(add).status()); - - auto hlo_resolver = [](const VersionedComputationHandle& handle) { - return nullptr; - }; - VersionedComputationHandle latest_version = computation.GetVersionedHandle(); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - debug_options)); - // The binary operation has implicit scalar broadcast, should be converted - // to an explicit broadcast intruction and a binary instruction. - EXPECT_EQ(4, hlo_computation->instruction_count()); - EXPECT_THAT(hlo_computation->root_instruction(), op::Add()); - LOG(INFO) << hlo_computation->root_instruction()->ToString(); - const auto& operands = hlo_computation->root_instruction()->operands(); - ASSERT_EQ(2, operands.size()); - EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast || - operands[1]->opcode() == HloOpcode::kBroadcast); -} - -TEST_F(UserComputationTest, CheckImplicitBroadcastToExplicitBroadcast) { - auto debug_options = DebugOptions(); - debug_options.set_xla_eliminate_hlo_implicit_broadcast(true); - - // Build a binary computation with degenerate broadcast. - // - // %a = Param({1, 2, 3}); - // %b = Param({1, 2, 1}); - // %add = Add(%a, %b, {}); - ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ParameterRequest a_request; - *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 2, 3}); - a_request.set_name("a"); - a_request.set_parameter(0); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle, - computation.AddParameterInstruction(a_request)); - - ParameterRequest b_request; - *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 2, 1}); - b_request.set_name("b"); - b_request.set_parameter(1); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle, - computation.AddParameterInstruction(b_request)); - - const int64 kDevice = 7; - OpSharding sharding; - sharding.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL); - sharding.add_tile_assignment_dimensions(1); - sharding.add_tile_assignment_devices(kDevice); - - TF_EXPECT_OK(computation.SetOpSharding(b_handle, sharding)); - - BinaryOpRequest add; - add.set_binop(BINOP_ADD); - *add.mutable_lhs() = a_handle; - *add.mutable_rhs() = b_handle; - TF_ASSERT_OK(computation.AddBinaryInstruction(add).status()); - - auto hlo_resolver = [](const VersionedComputationHandle& handle) { - return nullptr; - }; - VersionedComputationHandle latest_version = computation.GetVersionedHandle(); - - // Build the HLO computation. 
- TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - debug_options)); - - // b a - // | | - // reshape | - // | | - // broadcast | - // \ / - // add - EXPECT_EQ(5, hlo_computation->instruction_count()); - ASSERT_THAT( - hlo_computation->root_instruction(), - op::Add(op::Parameter(), op::Broadcast(op::Reshape(op::Parameter())))); - - const HloInstruction* broadcast = - hlo_computation->root_instruction()->operand(1); - EXPECT_TRUE(broadcast->has_sharding()); - - const HloInstruction* reshape = broadcast->operand(0); - EXPECT_TRUE(reshape->has_sharding()); -} - -TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) { - auto debug_options = DebugOptions(); - debug_options.set_xla_eliminate_hlo_implicit_broadcast(true); - - // Build a binary computation with in-dim broadcast and degenerate broadcast. - // - // %a = Param({2, 3}); - // %b = Param({2, 1, 4}); - // %add = Add(%a, %b, {0, 1}); - ComputationHandle handle; - handle.set_handle(123); - UserComputation computation("TheComputation", handle); - - ParameterRequest a_request; - *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 3}); - a_request.set_name("a"); - a_request.set_parameter(0); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle, - computation.AddParameterInstruction(a_request)); - - ParameterRequest b_request; - *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 1, 4}); - b_request.set_name("b"); - b_request.set_parameter(1); - TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle, - computation.AddParameterInstruction(b_request)); - - BinaryOpRequest add; - add.set_binop(BINOP_ADD); - *add.mutable_lhs() = a_handle; - *add.mutable_rhs() = b_handle; - add.add_broadcast_dimensions(0); - add.add_broadcast_dimensions(1); - TF_ASSERT_OK(computation.AddBinaryInstruction(add).status()); - - auto hlo_resolver = [](const VersionedComputationHandle& handle) { - return nullptr; - }; - VersionedComputationHandle latest_version = computation.GetVersionedHandle(); - - // Build the HLO computation. - TF_ASSERT_OK_AND_ASSIGN( - std::unique_ptr hlo_computation, - computation.BuildHloComputation(latest_version.version, hlo_resolver, - debug_options)); - - // The binary operation has in-dim broadcast and degenerate broadcast, should - // first do the in-dim broadcast then convert the degnerate broadcast into a - // reshape and a broadcast. - // - // b a - // | | - // broadcast reshape - // | | - // | broadcast - // \ / - // add - EXPECT_EQ(6, hlo_computation->instruction_count()); - EXPECT_THAT(hlo_computation->root_instruction(), op::Add()); - const auto& operands = hlo_computation->root_instruction()->operands(); - ASSERT_EQ(2, operands.size()); - EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast && - operands[1]->opcode() == HloOpcode::kBroadcast); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc new file mode 100644 index 00000000000000..10fc4958fae064 --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.cc @@ -0,0 +1,128 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" +#include "tensorflow/compiler/xla/service/while_util.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/lib/gtl/flatmap.h" +#include "tensorflow/core/lib/gtl/inlined_vector.h" + +namespace xla { + +// Replaces all uses of old_instr with new_instr except the use at +// `while_body_root` (which must be a tuple instruction) at index `tuple_index`. +// This utility helps us replace an instruction in the while body with a +// constant while still keeping it trivially loop invariant. +static Status ReplaceUsesWhileKeepingLoopInvariance( + HloInstruction* old_instr, HloInstruction* new_instr, + HloInstruction* while_body_root, int64 tuple_index) { + CHECK_EQ(while_body_root->opcode(), HloOpcode::kTuple); + + std::vector<HloInstruction*> users; + users.reserve(old_instr->user_count()); + c_copy(old_instr->users(), std::back_inserter(users)); + + for (auto* user : users) { + for (int64 i = 0, e = user->operand_count(); i < e; i++) { + if (user->operand(i) == old_instr && + !(user == while_body_root && i == tuple_index)) { + TF_RETURN_IF_ERROR(user->ReplaceOperandWith(i, new_instr)); + } + } + } + + return Status::OK(); +} + +StatusOr<bool> WhileLoopConstantSinking::TrySinkingConstantsIntoWhileBody( + HloInstruction* while_instr) { + HloComputation* while_body = while_instr->while_body(); + + const HloInstruction& init_value = *while_instr->operand(0); + if (init_value.opcode() != HloOpcode::kTuple) { + return false; + } + + bool changed = false; + + for (HloInstruction* invariant_gte : + WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) { + int64 index = invariant_gte->tuple_index(); + const HloInstruction& invariant_value = *init_value.operand(index); + if (invariant_value.opcode() == HloOpcode::kConstant) { + auto* constant_instr = + while_body->AddInstruction(invariant_value.Clone(/*suffix=*/".sunk")); + TF_RETURN_IF_ERROR(ReplaceUsesWhileKeepingLoopInvariance( + invariant_gte, constant_instr, while_body->root_instruction(), + index)); + changed = true; + } + } + + return changed; +} + +StatusOr<bool> WhileLoopConstantSinking::Run(HloModule* module) { + VLOG(2) << "HLO module before WhileLoopConstantSinking:"; + XLA_VLOG_LINES(2, module->ToString()); + + bool changed = false; + std::vector<HloInstruction*> while_instrs; + for (auto* comp : module->MakeNonfusionComputations()) { + // Right now we don't particularly care about optimizing while-of-while + // patterns. If/When we do, we'll want to visit the outer while (while_0) + // before we visit the inner while (while_1): + // + // while_1_body(state) { + // val = gte(state, 0) // Loop invariant + // use(val) + // } + // + // while_0_body(state) { + // val = gte(state, 0) // Loop invariant + // while_1 = while(init=tuple(val, ...), body=while_1_body, ...) + // ... + // } + // + // main { + // while_0 = while(init=(constant, ...), body=while_0_body, ...) + // } + // + // This will let us sink the constant into the outer while first and then + // into the inner while in a single run of this pass.
+ c_copy_if(comp->instructions(), std::back_inserter(while_instrs), + [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }); + } + + for (HloInstruction* while_instr : while_instrs) { + // We only sink into while loop bodies, but this can be extended to + // transform conditions as well. + TF_ASSIGN_OR_RETURN(bool result, + TrySinkingConstantsIntoWhileBody(while_instr)); + changed |= result; + } + + if (changed) { + VLOG(2) << "HLO module after WhileLoopConstantSinking:"; + XLA_VLOG_LINES(2, module->ToString()); + } else { + VLOG(2) << "HLO module unchanged after WhileLoopConstantSinking"; + } + + return changed; +} +} // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking.h b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h new file mode 100644 index 00000000000000..21fb8568a84985 --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking.h @@ -0,0 +1,68 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_CONSTANT_SINKING_H_ +#define TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_CONSTANT_SINKING_H_ + +#include "tensorflow/compiler/xla/service/hlo_module.h" +#include "tensorflow/compiler/xla/service/hlo_pass_interface.h" +#include "tensorflow/compiler/xla/statusor.h" + +namespace xla { + +// Sinks while loop invariant values that happen to be constants into the while +// loop body. This is probably not a win in isolation but may unlock further +// optimizations like constant folding. +// +// state = (..., const, ...) +// while (pred(state)) { +// (..., v, ...) = state +// use(v) +// state = (..., v, ...) +// } +// +// => +// +// state = (..., const, ...) +// while (pred(state)) { +// (..., v, ...) = state +// use(const) +// state = (..., v, ...) +// } +// +// Note that it leaves the `v` in place to keep that component of the state +// tuple trivially loop invariant. WhileLoopSimplifier will later get rid of +// `v`. +// +// We only sink into while loop bodies, but this can be extended to transform +// conditions as well. +// +// TODO(b/79121449): We should also sink broadcasts of constants. 
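The header comment above spells out why sinking alone is "probably not a win": it only pays off once later passes consume the sunk constants. As a rough illustration only, and not part of this change, the sketch below shows one way the pass could be scheduled ahead of the passes the comment names; it assumes the usual `HloPassPipeline` API and the existing `HloConstantFolding` and `WhileLoopSimplifier` passes, and `RunConstantSinkingExample` is a made-up name.

```c++
#include "tensorflow/compiler/xla/service/hlo_constant_folding.h"
#include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
#include "tensorflow/compiler/xla/service/while_loop_simplifier.h"

namespace xla {

// Hypothetical helper: sink loop-invariant constants first, then let constant
// folding and while-loop simplification clean up inside the loop bodies.
StatusOr<bool> RunConstantSinkingExample(HloModule* module) {
  HloPassPipeline pipeline("constant-sinking-example");
  pipeline.AddPass<WhileLoopConstantSinking>();
  pipeline.AddPass<HloConstantFolding>();
  pipeline.AddPass<WhileLoopSimplifier>();
  return pipeline.Run(module);
}

}  // namespace xla
```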
+class WhileLoopConstantSinking : public HloPassInterface { + public: + ~WhileLoopConstantSinking() override = default; + + tensorflow::StringPiece name() const override { + return "while-loop-constant-sinking"; + } + + StatusOr<bool> Run(HloModule* module) override; + + private: + StatusOr<bool> TrySinkingConstantsIntoWhileBody(HloInstruction* while_instr); +}; +} // namespace xla + +#endif // TENSORFLOW_COMPILER_XLA_SERVICE_WHILE_LOOP_CONSTANT_SINKING_H_ diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc new file mode 100644 index 00000000000000..393e75803888d8 --- /dev/null +++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc @@ -0,0 +1,200 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h" + +#include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace xla { +namespace { + +namespace op = xla::testing::opcode_matchers; +using ::testing::_; + +class WhileLoopConstantSinkingTest : public ::testing::Test {}; + +TEST_F(WhileLoopConstantSinkingTest, SinkOneConstant) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[2],f32[2]) parameter(0) + p_body.0 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=0 + p_body.1 = f32[2] get-tuple-element((f32[2],f32[2]) p_body), index=1 + + add.0 = f32[2] add(p_body.0, p_body.1) + ROOT root = (f32[2],f32[2]) tuple(add.0, p_body.1) +} + +condition { + p_cond = (f32[2],f32[2]) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + const_0 = f32[2] constant({1, 2}) + const_1 = f32[2] constant({2, 1}) + while_init = (f32[2],f32[2]) tuple(const_0, const_1) + ROOT while = (f32[2],f32[2]) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_body = module->GetComputationWithName("body"); + EXPECT_THAT(while_body->root_instruction(), + op::Tuple(op::Add(_, op::Constant()), _)); +} + +TEST_F(WhileLoopConstantSinkingTest, KeepConstantsLoopInvariant) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[2],f32[2],f32[2]) parameter(0) + p_body.0 = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_body), index=0 + p_body.1 = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_body), index=1 + p_body.2 = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_body), index=2 + + add.0 = f32[2] add(p_body.1, p_body.2) + ROOT root = (f32[2],f32[2],f32[2]) tuple(add.0, p_body.1, p_body.2) +} +
+condition { + p_cond = (f32[2],f32[2],f32[2]) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + const_0 = f32[2] constant({1, 2}) + const_1 = f32[2] constant({2, 1}) + const_2 = f32[2] constant({3, 1}) + while_init = (f32[2],f32[2],f32[2]) tuple(const_0, const_1, const_2) + ROOT while = (f32[2],f32[2],f32[2]) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_body = module->GetComputationWithName("body"); + EXPECT_THAT(while_body->root_instruction(), + op::Tuple(op::Add(op::Constant(), op::Constant()), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0)))); +} + +TEST_F(WhileLoopConstantSinkingTest, TupleShapedConstants) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_b = (f32[2],(f32[2],f32[2])) parameter(0) + p_b.0 = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_b), index=0 + p_b.1 = (f32[2],f32[2]) get-tuple-element((f32[2],(f32[2],f32[2])) p_b), index=1 + + p_b.1.1 = f32[2] get-tuple-element(p_b.1), index=0 + + ROOT root = (f32[2],f32[2],f32[2]) tuple(p_b.1.1, p_b.1) +} + +condition { + p_cond = (f32[2],(f32[2],f32[2])) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + const_0 = f32[2] constant({1, 2}) + const_1 = (f32[2], f32[2]) constant((f32[2], f32[2]) ({2, 1},{3,1})) + while_init = (f32[2],(f32[2],f32[2])) tuple(const_0, const_1) + ROOT while = (f32[2],(f32[2],f32[2])) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_body = module->GetComputationWithName("body"); + EXPECT_THAT(while_body->root_instruction(), + op::Tuple(op::GetTupleElement(op::Constant(), 0), + op::GetTupleElement(op::Parameter(0)))); +} + +TEST_F(WhileLoopConstantSinkingTest, DuplicateGTEs) { + // This test shows that the pass fails to optimize non-canonical IR. + // + // Even though the input IR has a constant value for p_b.2.dup, + // WhileLoopConstantSinking doesn't try to detect this. Instead, it relies on + // prior runs of HLO CSE to have commoned these identical GTE instructions. 
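The comment above makes the pass's reliance on canonical IR explicit: duplicate GTEs are expected to have been commoned by HLO CSE before this pass runs. As a hedged sketch of that ordering, and not code from this change, one could run `HloCSE` immediately before the sinking pass; this assumes the existing `HloCSE` pass with its `is_layout_sensitive` constructor argument, and `CanonicalizeThenSink` is a made-up name.

```c++
#include "tensorflow/compiler/xla/service/hlo_cse.h"
#include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
#include "tensorflow/compiler/xla/status_macros.h"

namespace xla {

// Hypothetical helper: common duplicate get-tuple-elements first so the
// sinking pass sees canonical IR, then sink the loop-invariant constants.
StatusOr<bool> CanonicalizeThenSink(HloModule* module) {
  TF_ASSIGN_OR_RETURN(bool cse_changed,
                      HloCSE(/*is_layout_sensitive=*/false).Run(module));
  TF_ASSIGN_OR_RETURN(bool sink_changed,
                      WhileLoopConstantSinking().Run(module));
  return cse_changed || sink_changed;
}

}  // namespace xla
```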
+ + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + p_b = (f32[2],f32[2],f32[2]) parameter(0) + + p_b.1 = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_b), index=1 + p_b.2 = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_b), index=2 + p_b.2.dup = f32[2] get-tuple-element((f32[2],f32[2],f32[2]) p_b), index=2 + + add.0 = f32[2] add(p_b.1, p_b.2.dup) + ROOT root = (f32[2],f32[2],f32[2]) tuple(add.0, p_b.1, p_b.2) +} + +condition { + p_cond = (f32[2],f32[2],f32[2]) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + const_0 = f32[2] constant({1, 2}) + const_1 = f32[2] constant({2, 1}) + const_2 = f32[2] constant({3, 1}) + while_init = (f32[2],f32[2],f32[2]) tuple(const_0, const_1, const_2) + ROOT while = (f32[2],f32[2],f32[2]) while(while_init), condition=condition, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + TF_ASSERT_OK_AND_ASSIGN(bool changed, + WhileLoopConstantSinking{}.Run(module.get())); + ASSERT_TRUE(changed); + + auto* while_body = module->GetComputationWithName("body"); + EXPECT_THAT(while_body->root_instruction(), + op::Tuple(op::Add(op::Constant(), ::testing::Not(op::Constant())), + op::GetTupleElement(op::Parameter(0)), + op::GetTupleElement(op::Parameter(0)))); +} +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc index 3ef0cdff675125..09ddcffb22c218 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc @@ -98,14 +98,17 @@ static void CreateLoopInvariantCopy( // Returns true if `instruction` is worth hoisting only if it lets us hoist some // instruction using it. The rationale is that hoisting these instructions will // prevent simplification and fusion in the while body. -static bool NotWorthHoistingIndividually(const HloInstruction& instruction) { +bool WhileLoopInvariantCodeMotion::NotWorthHoistingIndividually( + const HloInstruction& instruction) { switch (instruction.opcode()) { default: return false; + case HloOpcode::kConstant: + return !hoist_constants_; + case HloOpcode::kBitcast: case HloOpcode::kBroadcast: - case HloOpcode::kConstant: case HloOpcode::kReshape: case HloOpcode::kReverse: case HloOpcode::kSlice: @@ -115,26 +118,8 @@ static bool NotWorthHoistingIndividually(const HloInstruction& instruction) { } } -// Populates `gte_set` with the GetTupleElement instructions in `while_body` -// that access elements in the parameter tuple that don't change across -// iterations. Assumes `while_body` is the body computation of the while loop -// in question. 
-static void GatherInvariantGTEs(HloComputation* while_body, - FlatSet<HloInstruction*>* gte_set) { - const HloInstruction::InstructionVector root_operands = - while_body->root_instruction()->operands(); - for (int i = 0; i < root_operands.size(); i++) { - HloInstruction* instr = root_operands[i]; - if (instr->opcode() == HloOpcode::kGetTupleElement && - instr->tuple_index() == i && - instr->operand(0) == while_body->parameter_instruction(0) && - ShapeUtil::IsArray(instr->shape())) { - InsertOrDie(gte_set, instr); - } - } -} - -static StatusOr<bool> TryHoistingInvariantInstructionsFromWhileBody( +StatusOr<bool> +WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody( HloInstruction* while_instr) { auto print_no_metadata = HloPrintOptions{}.set_print_metadata(false); @@ -172,14 +157,24 @@ static StatusOr<bool> TryHoistingInvariantInstructionsFromWhileBody( // unhoisted_invariant_instructions -- they can be legally hoisted, but there // is no benefit to hoisting them unless something that uses it is also // hoisted. - GatherInvariantGTEs(while_body, &unhoisted_invariant_instructions); + for (auto* instr : WhileUtil::GetInvariantGTEsForWhileBody(*while_body)) { + if (ShapeUtil::IsArray(instr->shape())) { + // TODO(b/79147885): We should try to generalize this to tuples for + // uniformity's sake, if nothing else. + InsertOrDie(&unhoisted_invariant_instructions, instr); + } + } - if (unhoisted_invariant_instructions.empty()) { + if (unhoisted_invariant_instructions.empty() && !hoist_constants_) { // There are no obviously loop invariant elements in the state being // threaded through the while loop so give up. In theory this precondition // is too strong -- we could have code that e.g. permutes the elements in // the while state but uses a select to pick the same value on every // iteration. + // + // If we were asked to hoist constants, we need to scan the while body for + // constants even if we didn't find any loop invariant values in the while + // state tuple. return false; } @@ -256,6 +251,9 @@ static StatusOr<bool> TryHoistingInvariantInstructionsFromWhileBody( } StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) { + VLOG(2) << "HLO module before WhileLoopInvariantCodeMotion:"; + XLA_VLOG_LINES(2, module->ToString()); + bool changed = false; std::vector<HloInstruction*> while_instrs; for (auto* comp : module->computations()) { @@ -283,6 +281,14 @@ StatusOr<bool> WhileLoopInvariantCodeMotion::Run(HloModule* module) { TryHoistingInvariantInstructionsFromWhileBody(while_instr)); changed |= result; } + + if (changed) { + VLOG(2) << "HLO module after WhileLoopInvariantCodeMotion:"; + XLA_VLOG_LINES(2, module->ToString()); + } else { + VLOG(2) << "HLO module unchanged after WhileLoopInvariantCodeMotion"; + } + return changed; } } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h index 8c4b765b0003c4..8e6cc8787576e4 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h @@ -27,12 +27,28 @@ namespace xla { class WhileLoopInvariantCodeMotion : public HloPassInterface { public: + // If `hoist_constants` is true then constants are always hoisted out of while + // loop bodies. Otherwise they are only hoisted out if they enable other + // non-trivial computations to be hoisted out.
+ // + // Setting `hoist_constants` to false can help if LICM is run in the + // mid-level HLO pipeline because hoisting constants out of while loop bodies + // can break optimizations like constant folding. + explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false) + : hoist_constants_(hoist_constants) {} ~WhileLoopInvariantCodeMotion() override = default; tensorflow::StringPiece name() const override { return "while-loop-invariant-code-motion"; } StatusOr<bool> Run(HloModule* module) override; + + private: + bool NotWorthHoistingIndividually(const HloInstruction& instruction); + StatusOr<bool> TryHoistingInvariantInstructionsFromWhileBody( + HloInstruction* while_instr); + + bool hoist_constants_; }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc index 799340fda905fb..8831c513eee66e 100644 --- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc +++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" #include "tensorflow/core/lib/core/status_test_util.h" @@ -438,5 +439,77 @@ TEST_F(WhileLoopInvariantCodeMotionTest, BodyHasNonTupleRoot) { EXPECT_FALSE(simplified_loop); } +const char* const kConstantHoistingTestCase = R"( +HloModule ModuleWithWhile + +body { + p_body = (f32[2]{0}) parameter(0) + p_body.1 = f32[2]{0} get-tuple-element(p_body), index=0 + const = f32[2]{0} constant({3, 4}) + add.0 = f32[2]{0} add(p_body.1, const) + ROOT root = (f32[2]{0}) tuple(add.0) +} + +condition { + p_cond = (f32[2]{0}) parameter(0) + ROOT result = pred[] constant(true) +} + +ENTRY entry { + const_0 = f32[2]{0} constant({1, 2}) + while_init = (f32[2]{0}) tuple(const_0) + ROOT while = (f32[2]{0}) while(while_init), condition=condition, body=body +} +)"; + +TEST_F(WhileLoopInvariantCodeMotionTest, HoistsConstantWhenAsked) { + ParseAndVerifyModule(kConstantHoistingTestCase); + + TF_ASSERT_OK_AND_ASSIGN( + bool simplified_loop, + WhileLoopInvariantCodeMotion{/*hoist_constants=*/true}.Run(&module())); + EXPECT_TRUE(simplified_loop); + + HloComputation* while_body = module().GetComputationWithName("wide.body"); + ASSERT_NE(while_body, nullptr); + + // We expect the while body to be the equivalent of: + // + // wide.body { + // wide_param.1 = (f32[2]{0}, f32[2]{0}) parameter(0) + // get-tuple-element.1 = f32[2]{0} get-tuple-element(wide_param.1), index=0 + // tuple.1 = (f32[2]{0}) tuple(get-tuple-element.1) + // get-tuple-element.4 = f32[2]{0} get-tuple-element(tuple.1), index=0 + // get-tuple-element.7 = f32[2]{0} get-tuple-element(wide_param.1), index=1 + // add.1 = f32[2]{0} add(get-tuple-element.4, get-tuple-element.7) + // tuple.3 = (f32[2]{0}) tuple(add.1) + // get-tuple-element.8 = f32[2]{0} get-tuple-element(tuple.3), index=0 + // get-tuple-element.9 = f32[2]{0} get-tuple-element(wide_param.1), index=1 + // ROOT tuple.4 = (f32[2]{0}, f32[2]{0}) tuple(get-tuple-element.8, + // get-tuple-element.9) + // } + + auto wide_param_1 = op::Parameter(0); + auto get_tuple_element_1 = op::GetTupleElement(wide_param_1, 0); + auto tuple_1 = op::Tuple(get_tuple_element_1); + auto
get_tuple_element_4 = op::GetTupleElement(tuple_1, 0); + auto get_tuple_element_7 = op::GetTupleElement(wide_param_1, 1); + auto add_1 = op::Add(get_tuple_element_4, get_tuple_element_7); + auto tuple_3 = op::Tuple(add_1); + auto get_tuple_element_8 = op::GetTupleElement(tuple_3, 0); + auto get_tuple_element_9 = op::GetTupleElement(wide_param_1, 1); + auto tuple_4 = op::Tuple(get_tuple_element_8, get_tuple_element_9); + + EXPECT_THAT(while_body->root_instruction(), tuple_4); +} + +TEST_F(WhileLoopInvariantCodeMotionTest, DoesNotHoistConstantByDefault) { + ParseAndVerifyModule(kConstantHoistingTestCase); + + TF_ASSERT_OK_AND_ASSIGN(bool simplified_loop, + WhileLoopInvariantCodeMotion{}.Run(&module())); + EXPECT_FALSE(simplified_loop); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc index bd0794184328b7..473eab2ea84eb8 100644 --- a/tensorflow/compiler/xla/service/while_util.cc +++ b/tensorflow/compiler/xla/service/while_util.cc @@ -117,9 +117,13 @@ WhileUtil::MakeInstructionsLiveIn( HloInstruction* new_while = containing_computation->AddInstruction( HloInstruction::CreateWhile(new_while_shape, new_while_condition, new_while_body, new_while_init)); - TF_RETURN_IF_ERROR(containing_computation->ReplaceInstruction( - while_instr, TupleUtil::ExtractPrefix( - new_while, while_instr->shape().tuple_shapes_size()))); + + // We want to get rid of the old while instruction even if it has side + // effecting operations so we do a manual HloComputation::RemoveInstruction + // instead of relying on HloComputation::ReplaceInstruction. + TF_RETURN_IF_ERROR(while_instr->ReplaceAllUsesWith(TupleUtil::ExtractPrefix( + new_while, while_instr->shape().tuple_shapes_size()))); + TF_RETURN_IF_ERROR(containing_computation->RemoveInstruction(while_instr)); HloInstruction* while_body_param = new_while_body->parameter_instruction(0); std::vector live_in_instructions; @@ -244,4 +248,21 @@ static Shape MakeLoopStateShape(const WhileUtil::LoopStateTy& init_values) { } return result; } + +/*static*/ std::vector WhileUtil::GetInvariantGTEsForWhileBody( + const HloComputation& while_body) { + std::vector result; + const HloInstruction::InstructionVector root_operands = + while_body.root_instruction()->operands(); + for (int i = 0; i < root_operands.size(); i++) { + HloInstruction* instr = root_operands[i]; + if (instr->opcode() == HloOpcode::kGetTupleElement && + instr->tuple_index() == i && + instr->operand(0) == while_body.parameter_instruction(0)) { + result.push_back(instr); + } + } + return result; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h index 1688d4674269c3..e67636d80f4b68 100644 --- a/tensorflow/compiler/xla/service/while_util.h +++ b/tensorflow/compiler/xla/service/while_util.h @@ -38,17 +38,21 @@ class WhileUtil { }; // Replaces `while_instr` with a new while instruction that is equivalent to - // `while_instr`, except that it has all of the HLO instructions in + // `while_instr` except that it has all of the HLO instructions in // `instructions` as live-in, loop invariant values. These new live in values // are represented as new elements appended to the parameter of the while // loop, which must be of tuple shape. GetTupleElement instructions computing // each new live in value is returned in the `while_body_live_in_values` // vector. // - // Precondition: `while_instr` must have a tuple shaped state. 
+ // Deletes `while_instr` after replacing it. // - // Every instruction in `instructions` must be contained in the computation - // that contains `while_instr`. + // Preconditions: + // + // `while_instr` must have a tuple shaped state. + // + // Every instruction in `instructions` must be contained in the computation + // that contains `while_instr`. static StatusOr MakeInstructionsLiveIn( HloInstruction* while_instr, tensorflow::gtl::ArraySlice instructions); @@ -74,6 +78,12 @@ class WhileUtil { HloComputation* computation, int32 trip_count, const LoopStateTy& init_values, const LoopBodyGeneratorTy& loop_body_generator); + + // Returns the GetTupleElement instructions in `while_body` that access + // elements in the parameter tuple that don't change across iterations. + // Assumes `while_body` is the body computation of the while loop in question. + static std::vector GetInvariantGTEsForWhileBody( + const HloComputation& while_body); }; } // namespace xla diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc index cf0d0db99bd92b..d79d3297213e83 100644 --- a/tensorflow/compiler/xla/service/while_util_test.cc +++ b/tensorflow/compiler/xla/service/while_util_test.cc @@ -16,8 +16,9 @@ limitations under the License. #include "tensorflow/compiler/xla/service/while_util.h" #include "tensorflow/compiler/xla/service/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" +#include "tensorflow/compiler/xla/util.h" namespace xla { namespace { @@ -49,7 +50,7 @@ ENTRY entry { )"; TF_ASSIGN_OR_RETURN(std::unique_ptr module, - tools::Parse(hlo_string)); + ParseHloString(hlo_string)); *entry_computation = module->entry_computation(); *param0 = (*entry_computation)->parameter_instruction(0); @@ -126,5 +127,84 @@ TEST(WhileUtilTest, MakeTwoInstructionsLive) { op::GetTupleElement(op::Parameter(0), 3))); } +TEST(WhileUtilTest, GetInvariantGTEsForWhileBody) { + const char* const hlo_string = R"( +HloModule ModuleWithWhile + +body { + param.b = (s32[], s32[]) parameter(0) + gte.0 = s32[] get-tuple-element(param.b), index=0 + gte.1 = s32[] get-tuple-element(param.b), index=1 + add = s32[] add(gte.0, gte.1) + ROOT tuple = (s32[], s32[]) tuple(gte.0, add) +} + +cond { + param.c = (s32[], s32[]) parameter(0) + ROOT constant = pred[] constant(true) +} + +ENTRY main { + init = (s32[], s32[]) parameter(0) + ROOT while = (s32[], s32[]) while(init), condition=cond, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + HloComputation* while_body = module->GetComputationWithName("body"); + + ASSERT_NE(while_body, nullptr) + << "Expected exactly one while_body computation"; + + std::vector gte_list = + WhileUtil::GetInvariantGTEsForWhileBody(*while_body); + + ASSERT_EQ(gte_list.size(), 1); + EXPECT_EQ((*gte_list.begin())->name(), "gte.0"); +} + +TEST(WhileUtilTest, AlwaysRemovePreviousWhileBody) { + const char* const hlo_string = R"( +HloModule WhileWithSideEffects + +body { + param.b = (s32[], s32[]) parameter(0) + gte.0 = s32[] get-tuple-element(param.b), index=0 + gte.1 = s32[] get-tuple-element(param.b), index=1 + add = s32[] add(gte.0, gte.1) + ROOT tuple = (s32[], s32[]) tuple(gte.0, add) +} + +cond { + param.c = (s32[], s32[]) parameter(0) + ROOT condition = pred[] infeed() +} + +ENTRY main { + init = (s32[], s32[]) parameter(0) + to_make_live_in = f32[100] parameter(1) + ROOT while 
= (s32[], s32[]) while(init), condition=cond, body=body +} +)"; + + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, + ParseHloString(hlo_string)); + + HloComputation* main = module->GetComputationWithName("main"); + HloInstruction* while_instr = main->root_instruction(); + HloInstruction* to_make_live_in = main->parameter_instruction(1); + + TF_ASSERT_OK_AND_ASSIGN( + WhileUtil::MakeInstructionsLiveInResult make_live_in_result, + WhileUtil::MakeInstructionsLiveIn(while_instr, + /*instructions=*/{to_make_live_in})); + + auto is_while = [](const HloInstruction* instr) { + return instr->opcode() == HloOpcode::kWhile; + }; + EXPECT_EQ(c_count_if(main->instructions(), is_while), 1); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc index 4f8cdc1e0e73cd..f5331280ee9f25 100644 --- a/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc +++ b/tensorflow/compiler/xla/service/zero_sized_hlo_elimination_test.cc @@ -19,7 +19,6 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" #include "tensorflow/compiler/xla/service/hlo_instruction.h" @@ -46,9 +45,9 @@ class ZeroSizedHloEliminationTest : public HloTestBase { 0, ShapeUtil::MakeShape(F32, {3, 0}), "zero sized param"))) {} StatusOr RunZeroSizedElimination() { - HloModule module("zero_sized_elimination_test_module"); - module.AddEntryComputation(builder_.Build()); - return ZeroSizedHloElimination{}.Run(&module); + auto module = CreateNewModule("zero_sized_elimination_test_module"); + module->AddEntryComputation(builder_.Build()); + return ZeroSizedHloElimination{}.Run(module.get()); } HloComputation::Builder builder_; diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h index 5b44c26b7c7b08..14c35e7b84f07b 100644 --- a/tensorflow/compiler/xla/service_interface.h +++ b/tensorflow/compiler/xla/service_interface.h @@ -16,8 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_INTERFACE_H_ #define TENSORFLOW_COMPILER_XLA_SERVICE_INTERFACE_H_ +#include "tensorflow/compiler/xla/status.h" #include "tensorflow/compiler/xla/xla.pb.h" -#include "tensorflow/core/lib/core/status.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" namespace xla { @@ -31,99 +32,52 @@ class ServiceInterface { virtual ~ServiceInterface() = default; // TODO(b/31824348): Convert to use StatusOr. 
- virtual tensorflow::Status TransferToClient( - const TransferToClientRequest* arg, TransferToClientResponse* result) = 0; + virtual Status TransferToClient(const TransferToClientRequest* arg, + TransferToClientResponse* result) = 0; - virtual tensorflow::Status TransferToServer( - const TransferToServerRequest* arg, TransferToServerResponse* result) = 0; + virtual Status TransferToServer(const TransferToServerRequest* arg, + TransferToServerResponse* result) = 0; - virtual tensorflow::Status TransferToInfeed( - const TransferToInfeedRequest* arg, TransferToInfeedResponse* result) = 0; + virtual Status TransferToInfeed(const TransferToInfeedRequest* arg, + TransferToInfeedResponse* result) = 0; - virtual tensorflow::Status TransferFromOutfeed( - const TransferFromOutfeedRequest* arg, - TransferFromOutfeedResponse* result) = 0; + virtual Status TransferFromOutfeed(const TransferFromOutfeedRequest* arg, + TransferFromOutfeedResponse* result) = 0; - virtual tensorflow::Status ResetDevice(const ResetDeviceRequest* arg, - ResetDeviceResponse* result) = 0; + virtual Status ResetDevice(const ResetDeviceRequest* arg, + ResetDeviceResponse* result) = 0; - virtual tensorflow::Status LoadComputationSnapshot( - const LoadComputationSnapshotRequest* request, - LoadComputationSnapshotResponse* result) = 0; + virtual Status ExecuteGraph(const ExecuteGraphRequest* arg, + ExecuteResponse* result) = 0; - virtual tensorflow::Status Execute(const ExecuteRequest* arg, - ExecuteResponse* result) = 0; + virtual Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg, + ExecuteParallelResponse* result) = 0; - virtual tensorflow::Status ExecuteGraph(const ExecuteGraphRequest* arg, - ExecuteResponse* result) = 0; + virtual Status WaitForExecution(const WaitForExecutionRequest* arg, + WaitForExecutionResponse* result) = 0; - virtual tensorflow::Status ExecuteParallel( - const ExecuteParallelRequest* arg, ExecuteParallelResponse* result) = 0; + virtual Status DeconstructTuple(const DeconstructTupleRequest* arg, + DeconstructTupleResponse* result) = 0; - virtual tensorflow::Status ExecuteGraphParallel( - const ExecuteGraphParallelRequest* arg, - ExecuteParallelResponse* result) = 0; - - virtual tensorflow::Status ExecuteAsync(const ExecuteAsyncRequest* arg, - ExecuteAsyncResponse* result) = 0; - - virtual tensorflow::Status WaitForExecution( - const WaitForExecutionRequest* arg, WaitForExecutionResponse* result) = 0; - - virtual tensorflow::Status DeconstructTuple( - const DeconstructTupleRequest* arg, DeconstructTupleResponse* result) = 0; - - virtual tensorflow::Status GetComputationStats( - const ComputationStatsRequest* arg, ComputationStatsResponse* result) = 0; - - virtual tensorflow::Status GetComputationGraphStats( + virtual Status GetComputationGraphStats( const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) = 0; - virtual tensorflow::Status GetComputationShape( - const GetComputationShapeRequest* arg, - GetComputationShapeResponse* result) = 0; - - virtual tensorflow::Status GetShape(const GetShapeRequest* arg, - GetShapeResponse* result) = 0; - - virtual tensorflow::Status CreateChannelHandle( - const CreateChannelHandleRequest* arg, - CreateChannelHandleResponse* result) = 0; - - virtual tensorflow::Status GetDeviceHandles( - const GetDeviceHandlesRequest* arg, GetDeviceHandlesResponse* result) = 0; - - // Methods used by ComputationBuilder. 
- virtual tensorflow::Status Computation(const ComputationRequest* arg, - ComputationResponse* result) = 0; - - virtual tensorflow::Status Op(const OpRequest* arg, OpResponse* result) = 0; - - virtual tensorflow::Status GetLocalShape(const GetLocalShapeRequest* arg, - GetLocalShapeResponse* result) = 0; - - virtual tensorflow::Status SetReturnValue( - const SetReturnValueRequest* arg, SetReturnValueResponse* results) = 0; - - virtual tensorflow::Status IsConstant(const IsConstantRequest* arg, - IsConstantResponse* result) = 0; + virtual Status GetShape(const GetShapeRequest* arg, + GetShapeResponse* result) = 0; - virtual tensorflow::Status ComputeConstant( - const ComputeConstantRequest* arg, ComputeConstantResponse* result) = 0; + virtual Status CreateChannelHandle(const CreateChannelHandleRequest* arg, + CreateChannelHandleResponse* result) = 0; - virtual tensorflow::Status ComputeConstantGraph( - const ComputeConstantGraphRequest* arg, - ComputeConstantResponse* result) = 0; + virtual Status GetDeviceHandles(const GetDeviceHandlesRequest* arg, + GetDeviceHandlesResponse* result) = 0; - // Methods used by Computation. - virtual tensorflow::Status SnapshotComputation( - const SnapshotComputationRequest* ag, - SnapshotComputationResponse* result) = 0; + virtual Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg, + ComputeConstantResponse* result) = 0; // Methods used by GlobalData. - virtual tensorflow::Status Unregister(const UnregisterRequest* arg, - UnregisterResponse* result) = 0; + virtual Status Unregister(const UnregisterRequest* arg, + UnregisterResponse* result) = 0; }; } // namespace xla diff --git a/tensorflow/compiler/xla/shape_layout.cc b/tensorflow/compiler/xla/shape_layout.cc index 789eba5780d37e..7ee366b27a82bd 100644 --- a/tensorflow/compiler/xla/shape_layout.cc +++ b/tensorflow/compiler/xla/shape_layout.cc @@ -22,24 +22,24 @@ limitations under the License. namespace xla { -tensorflow::Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) { +Status ShapeLayout::CopyLayoutFromShape(const Shape& other_shape) { if (!ShapeUtil::Compatible(other_shape, shape_)) { return InvalidArgument("Shape %s is not compatible with shape %s", ShapeUtil::HumanString(other_shape).c_str(), ShapeUtil::HumanString(shape()).c_str()); } shape_ = other_shape; - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const { +Status ShapeLayout::AssignLayoutToShape(Shape* to_shape) const { if (!ShapeUtil::Compatible(*to_shape, shape_)) { return InvalidArgument("Shape %s is not compatible with shape %s", ShapeUtil::HumanString(*to_shape).c_str(), ShapeUtil::HumanString(shape()).c_str()); } *to_shape = shape_; - return tensorflow::Status::OK(); + return Status::OK(); } void ShapeLayout::SetToDefaultLayout() { diff --git a/tensorflow/compiler/xla/shape_layout.h b/tensorflow/compiler/xla/shape_layout.h index a1dce758cd3ab3..36806da599cc9b 100644 --- a/tensorflow/compiler/xla/shape_layout.h +++ b/tensorflow/compiler/xla/shape_layout.h @@ -40,7 +40,7 @@ class ShapeLayout { // Assigns the layouts in this ShapeLayout to the Layout fields of the given // shape. 'to_shape' and the shape of the ShapeLayout object must be // compatible. - tensorflow::Status AssignLayoutToShape(Shape* to_shape) const; + Status AssignLayoutToShape(Shape* to_shape) const; // Returns true if the Layouts in this ShapeLayout match the layouts in the // given shape. Returns false otherwise. 
If the given shape is not compatible @@ -49,7 +49,7 @@ class ShapeLayout { // Copies the layout from the given shape into this ShapeLayout. 'other_shape' // must be compatible with the ShapeLayout's shape. - tensorflow::Status CopyLayoutFromShape(const Shape& other_shape); + Status CopyLayoutFromShape(const Shape& other_shape); // Clears (Layout::Clear) all the Layouts stored in this object. void Clear(); diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h index ffaa40c2d673a2..5b14953ebb243d 100644 --- a/tensorflow/compiler/xla/shape_tree.h +++ b/tensorflow/compiler/xla/shape_tree.h @@ -42,36 +42,20 @@ namespace internal { template struct ShapeTreeNode { // Data corresponding to this node. - T data; + std::pair data; - // Children of this node. - std::vector> children; + // Children of this node, as indices into the container's nodes_ array. + std::vector children; - ShapeTreeNode() = default; - explicit ShapeTreeNode(const T& data) : data(data) {} - - ShapeTreeNode(const ShapeTreeNode& other) - : data(other.data), children(other.children.size()) { - for (size_t i = 0; i < children.size(); ++i) { - children[i] = ::xla::MakeUnique(*other.children[i]); - } - } - - ShapeTreeNode& operator=(const ShapeTreeNode& other) { - if (this != &other) { - data = other.data; - children.resize(other.children.size()); - for (size_t i = 0; i < children.size(); ++i) { - children[i] = ::xla::MakeUnique(*other.children[i]); - } - } - return *this; - } + explicit ShapeTreeNode(ShapeIndex index) + : ShapeTreeNode(std::move(index), T()) {} + ShapeTreeNode(ShapeIndex index, T data) + : data(std::move(index), std::move(data)) {} }; } // namespace internal -template +template class ShapeTreeIterator; // A ShapeTree is a recursive data structure which mirrors the structure of a @@ -95,10 +79,9 @@ class ShapeTreeIterator; // before its ShapeTree goes away. template class ShapeTree { - friend class ShapeTreeIterator; - friend class ShapeTreeIterator; - public: + using Node = internal::ShapeTreeNode; + // Default constructor creates a tree with a nil shape (i.e. an empty tuple). ShapeTree() : ShapeTree(ShapeUtil::MakeNil()) {} @@ -110,30 +93,12 @@ class ShapeTree { // alive longer than this ShapeTree. explicit ShapeTree(Shape shape); explicit ShapeTree(const Shape* shape); + explicit ShapeTree(const std::shared_ptr& shape); // Create ShapeTree with the given shape, and init_value for all nodes. ShapeTree(Shape shape, const T& init_value); ShapeTree(const Shape* shape, const T& init_value); - - ShapeTree(const ShapeTree& other) { *this = other; } - ShapeTree(ShapeTree&&) = default; - - ShapeTree& operator=(const ShapeTree& other) { - root_ = other.root_; - - // Fix up internal pointer if necessary. - if (other.shape_storage_) { - CHECK_EQ(other.shape_, other.shape_storage_.get()); - shape_storage_.reset(new Shape(*other.shape_)); - shape_ = shape_storage_.get(); - } else { - shape_ = other.shape_; - } - - return *this; - } - - ShapeTree& operator=(ShapeTree&& other) = default; + ShapeTree(const std::shared_ptr& shape, const T& init_value); // Returns the data element associated with the array in the shape at the // given index (see ShapeUtil::GetSubshape for how indexes are defined). 
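The ShapeTree changes above replace the old recursive node representation with a single `nodes_` vector laid out in pre-order, where each node carries its `ShapeIndex` alongside its payload and refers to its children by index into that vector. Purely as an illustration of what that layout implies, and not code from this change, the sketch below walks a small nested tuple shape; it only uses `ShapeUtil` helpers and the public `ShapeTree` interface shown in this diff.

```c++
#include "tensorflow/compiler/xla/shape_tree.h"
#include "tensorflow/compiler/xla/shape_util.h"

void ShapeTreeLayoutSketch() {
  // shape = ((f32[2], s32[]), f32[3])
  xla::Shape shape = xla::ShapeUtil::MakeTupleShape(
      {xla::ShapeUtil::MakeTupleShape(
           {xla::ShapeUtil::MakeShape(xla::F32, {2}),
            xla::ShapeUtil::MakeShape(xla::S32, {})}),
       xla::ShapeUtil::MakeShape(xla::F32, {3})});
  xla::ShapeTree<int> tree(shape, /*init_value=*/0);

  // nodes_ is stored in pre-order, so plain iteration visits the ShapeIndexes
  // {}, {0}, {0,0}, {0,1}, {1} in that order; leaf iteration skips the two
  // tuple nodes and visits only {0,0}, {0,1}, {1}.
  for (const auto& index_and_data : tree) {
    (void)index_and_data;  // .first is the ShapeIndex, .second the payload.
  }
  *tree.mutable_element({0, 1}) = 42;  // payload attached to subshape {0,1}
}
```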
@@ -161,63 +126,70 @@ class ShapeTree { return Lookup(index)->children.empty(); } - // iterator implements a forward_iterator with value_type = - // std::pair - using iterator = ShapeTreeIterator; - using const_iterator = ShapeTreeIterator; + ShapeTree(const ShapeTree&) = default; + ShapeTree& operator=(const ShapeTree&) = default; + ShapeTree(ShapeTree&&) = default; + ShapeTree& operator=(ShapeTree&& other) = default; + + // iterator implements a bidirectional_iterator with + // value_type = std::pair. + // + // The iteration order is guaranteed to be a pre-order walk of the ShapeTree. + using iterator = + ShapeTreeIterator, typename std::vector::iterator, + std::pair>; + using const_iterator = + ShapeTreeIterator, + typename std::vector::const_iterator, + const std::pair>; + using reverse_iterator = std::reverse_iterator; + using const_reverse_iterator = std::reverse_iterator; // begin/end for iterating over all nodes. iterator begin() { - return iterator(&root_, /*iterate_leaves_only=*/false, - /*reverse=*/false); + return iterator(&nodes_, nodes_.begin(), + /*iterate_leaves_only=*/false); } iterator end() { - return iterator(nullptr, /*iterate_leaves_only=*/false, - /*reverse=*/false); + return iterator(&nodes_, nodes_.end(), + /*iterate_leaves_only=*/false); } const_iterator begin() const { - return const_iterator(&root_, /*iterate_leaves_only=*/false, - /*reverse=*/false); + return const_iterator(&nodes_, nodes_.begin(), + /*iterate_leaves_only=*/false); } const_iterator end() const { - return const_iterator(nullptr, /*iterate_leaves_only=*/false, - /*reverse=*/false); + return const_iterator(&nodes_, nodes_.end(), + /*iterate_leaves_only=*/false); } // rbegin/rend for iterating over all nodes in reverse. - iterator rbegin() { - return iterator(&root_, /*iterate_leaves_only=*/false, - /*reverse=*/true); - } - iterator rend() { - return iterator(nullptr, /*iterate_leaves_only=*/false, - /*reverse=*/true); + reverse_iterator rbegin() { return reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); } - const_iterator rbegin() const { - return const_iterator(&root_, /*iterate_leaves_only=*/false, - /*reverse=*/true); - } - const_iterator rend() const { - return const_iterator(nullptr, /*iterate_leaves_only=*/false, - /*reverse=*/true); + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); } // leaf_begin()/leaf_end() iterates over all leaf nodes (nodes with no // children). iterator leaf_begin() { - return iterator(&root_, /*iterate_leaves_only=*/true, /*reverse=*/false); + return iterator(&nodes_, nodes_.begin(), + /*iterate_leaves_only=*/true); } iterator leaf_end() { - return iterator(nullptr, /*iterate_leaves_only=*/true, - /*reverse=*/false); + return iterator(&nodes_, nodes_.end(), + /*iterate_leaves_only=*/true); } const_iterator leaf_begin() const { - return const_iterator(&root_, /*iterate_leaves_only=*/true, - /*reverse=*/false); + return const_iterator(&nodes_, nodes_.begin(), + /*iterate_leaves_only=*/true); } const_iterator leaf_end() const { - return const_iterator(nullptr, /*iterate_leaves_only=*/true, - /*reverse=*/false); + return const_iterator(&nodes_, nodes_.end(), + /*iterate_leaves_only=*/true); } // range-based iterator for leaf_begin()/leaf_end(). 
tensorflow::gtl::iterator_range leaves() { @@ -227,22 +199,32 @@ class ShapeTree { return tensorflow::gtl::make_range(leaf_begin(), leaf_end()); } - iterator leaf_rbegin() { - return iterator(&root_, /*iterate_leaves_only=*/true, /*reverse=*/true); + reverse_iterator leaf_rbegin() { return reverse_iterator(leaf_end()); } + reverse_iterator leaf_rend() { return reverse_iterator(leaf_begin()); } + const_reverse_iterator leaf_rbegin() const { + return const_reverse_iterator(leaf_end()); } - iterator leaf_rend() { - return iterator(nullptr, /*iterate_leaves_only=*/true, - /*reverse=*/true); + const_reverse_iterator leaf_rend() const { + return const_reverse_iterator(leaf_begin()); } - const_iterator leaf_rbegin() const { - return const_iterator(&root_, /*iterate_leaves_only=*/true, - /*reverse=*/true); + + // Returns an iterator pointing to the given ShapeIndex. + // REQUIRES: index must exist in the ShapeTree. + iterator find(const ShapeIndex& index) { + Node* element = Lookup(index); + return iterator(&nodes_, typename std::vector::iterator(element), + /*iterate_leaves_only=*/false); } - const_iterator leaf_rend() const { - return const_iterator(nullptr, /*iterate_leaves_only=*/true, - /*reverse=*/true); + const_iterator find(const ShapeIndex& index) const { + Node* element = Lookup(index); + return iterator(&nodes_, + typename std::vector::const_iterator(element), + /*iterate_leaves_only=*/false); } + // Returns the number of leaf nodes in the tree. + int64 leaf_count() const { return std::distance(leaf_begin(), leaf_end()); } + // Recursively traverses the shape and calls the given function at each // element. The function has the following arguments: // @@ -282,8 +264,6 @@ class ShapeTree { bool operator!=(const ShapeTree& other) const { return !(*this == other); } private: - using Node = internal::ShapeTreeNode; - // Initialize node->children based on 'shape'. All children are assigned the // the given 'init_value'. void InitChildren(const Shape& shape, const T& init_value, Node* node); @@ -292,136 +272,57 @@ class ShapeTree { // default-constructed data values. void InitChildren(const Shape& shape, Node* node); + // Returns the number of subshapes, including interior nodes, in shape. + int64 CountSubshapes(const Shape& shape); + // Helpers for traversing the shape via ForEachElement. The helpers // recursively traverse the subtree rooted at "index" (defined as in // ShapeUtil::GetSubshape). template - static Status ForEachHelper(const Fn& func, const Node& node, - ShapeIndex* index); + static Status ForEachHelper(const Fn& func, const std::vector& nodes); template - static Status ForEachMutableHelper(const Fn& func, Node* node, - ShapeIndex* index); + static Status ForEachMutableHelper(const Fn& func, std::vector* nodes); // Return the tree node at the given index. Node* Lookup(const ShapeIndex& index); const Node* Lookup(const ShapeIndex& index) const; - // The root node, which contains all other nodes. - Node root_; + // The nodes in this shape tree. + std::vector nodes_; // If we own our Shape, this field contains it, and shape_ is a pointer into // here. Otherwise if we don't own our shape, this is nullptr. - std::unique_ptr shape_storage_; + std::shared_ptr shape_storage_; // The XLA shape mirrored in this ShapeTree. This is either // shape_storage_.get() or the Shape pointer passed to our constructor. const Shape* shape_; }; -// Internal iterator that performs a pre-order walk. This is copyable, but -// contains a vector so isn't cheap to copy. 
This also means post-increment is -// expensive. The iterator value_type is equivalent to a std::pair, similar to std::map. The non-const iterator's T& type can be mutated -// in-place. -template -class ShapeTreeIterator : public std::iterator> { +// Internal iterator that performs a pre-order walk. This is cheap to copy. +// The iterator value_type is equivalent to a +// std::pair&, similar to std::map. +template +class ShapeTreeIterator + : public std::iterator { public: - using value_type = - typename std::conditional, - std::pair>::type; - using NodeType = - typename std::conditional::Node, - typename ShapeTree::Node>::type; - - // Construct an iterator pointing at node. Node must either be the tree root - // or nullptr (which is equivalent to end() and should not be dereferenced or - // incremented). If iterate_leaves_only is true, the iterator will not include - // interior tree nodes, only leaves. If reverse is true, the iterator will - // visit nodes in the reverse of pre-order traversal. - ShapeTreeIterator(NodeType* node, bool iterate_leaves_only, bool reverse) - : node_(node), - iterate_leaves_only_(iterate_leaves_only), - reverse_(reverse) { - if (node_) { - if (reverse_) { - while (!node_->children.empty()) { - const int child_index = node_->children.size() - 1; - stack_.push_back({node_, child_index}); - node_ = node_->children[child_index].get(); - } - } else { - if (!node_->children.empty() && iterate_leaves_only) { - ++*this; - } - } + ShapeTreeIterator(ContainerType* nodes, IteratorType node, + bool iterate_leaves_only) + : nodes_(nodes), + node_(std::move(node)), + iterate_leaves_only_(iterate_leaves_only) { + while (iterate_leaves_only && node_ != nodes_->end() && + !node_->children.empty()) { + ++node_; } } - ShapeTreeIterator(const ShapeTreeIterator& other) - : node_(other.node_), - stack_(other.stack_), - iterate_leaves_only_(other.iterate_leaves_only_), - reverse_(other.reverse_) {} ShapeTreeIterator& operator++() { - CHECK_NE(nullptr, node_) << "walking off the end() of an iterator!"; - if (reverse_) { - while (!stack_.empty()) { - node_ = stack_.back().first; - int64 next_child_index = stack_.back().second - 1; - stack_.pop_back(); - if (next_child_index < 0) { - if (!iterate_leaves_only_) { - // All children are visited, yield . - return *this; - } - } else { - stack_.push_back({node_, next_child_index}); - node_ = node_->children[next_child_index].get(); - while (!node_->children.empty()) { - const int child_index = node_->children.size() - 1; - stack_.push_back({node_, child_index}); - node_ = node_->children[child_index].get(); - } - return *this; - } - } - } else { - // We're doing a pre-order walk, so if our current node has children take - // the first child. - if (!node_->children.empty()) { - stack_.push_back({node_, /*child-index=*/0}); - node_ = node_->children[0].get(); - if (node_->children.empty() || !iterate_leaves_only_) { - return *this; - } else { - // This is a non-leaf; tail-recurse. - return ++(*this); - } - } - // Otherwise we are currently at a leaf. Walk back up until a node - // contains a child we haven't visited yet. - while (!stack_.empty()) { - node_ = stack_.back().first; - int64 next_child_index = stack_.back().second + 1; - stack_.pop_back(); - if (node_->children.size() > next_child_index) { - stack_.push_back({node_, next_child_index}); - node_ = node_->children[next_child_index].get(); - - if (node_->children.empty() || !iterate_leaves_only_) { - return *this; - } else { - // This is a non-leaf; tail-recurse. 
- return ++(*this); - } - } - } + ++node_; + while (iterate_leaves_only_ && node_ != nodes_->end() && + !node_->children.empty()) { + ++node_; } - // We've walked off the end of the tree. Set node_ to nullptr to signify - // end(). - node_ = nullptr; - current_.reset(); return *this; } ShapeTreeIterator operator++(int) { @@ -429,52 +330,62 @@ class ShapeTreeIterator : public std::iterator nodes_->begin() && + !node_->children.empty()) { + --node_; + } + return *this; + } + ShapeTreeIterator operator--(int) { + auto i = *this; + --(*this); + return i; + } + bool operator==(const ShapeTreeIterator& other) const { return node_ == other.node_; } bool operator!=(const ShapeTreeIterator& other) const { return node_ != other.node_; } - value_type& operator*() { return UpdateCurrent(); } - value_type* operator->() { return &UpdateCurrent(); } + ValueType& operator*() { return node_->data; } + ValueType* operator->() { return &node_->data; } private: - // Updates the current_ member to reflect the current state. - value_type& UpdateCurrent() { - ShapeIndex index; - for (auto& node_and_index : stack_) { - index.push_back(node_and_index.second); - } - current_ = ::xla::MakeUnique(index, node_->data); - return *current_; - } - - // The node to which this iterator is pointing. This is the source of truth in - // the iterator - the stack only exists to facilitate walking back from - // children to parents. - NodeType* node_; - // Stack of {node, child-index} pairs of the path taken from the root to get - // to node_. This allows us to backtrack and know where to go next. - std::vector> stack_; + ContainerType* nodes_; + IteratorType node_; // True if we should not include interior nodes in our walk. bool iterate_leaves_only_; - // True if we should yield the reverse of the pre-order traversal. - bool reverse_; - // Placeholder for the current value. Ideally this wouldn't exist and would - // just be an rvalue, but operator -> needs to return a pointer to something. - // We cannot just use a plain old value_type as it contains a reference so - // cannot be default-constructed. 
- std::unique_ptr current_; }; +template +int64 ShapeTree::CountSubshapes(const Shape& shape) { + int64 current_count = 1; + if (ShapeUtil::IsTuple(shape)) { + int64 count = ShapeUtil::TupleElementCount(shape); + for (int i = 0; i < count; ++i) { + current_count += CountSubshapes(shape.tuple_shapes(i)); + } + } + return current_count; +} + template void ShapeTree::InitChildren(const Shape& shape, const T& init_value, Node* node) { if (ShapeUtil::IsTuple(shape)) { - for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - node->children.emplace_back(new Node(init_value)); - InitChildren(shape.tuple_shapes(i), init_value, - node->children.back().get()); + const int64 size = ShapeUtil::TupleElementCount(shape); + node->children.reserve(size); + ShapeIndex shape_index = node->data.first; + shape_index.push_back(0); + for (int i = 0; i < size; ++i) { + shape_index[shape_index.size() - 1] = i; + node->children.push_back(nodes_.size()); + nodes_.emplace_back(shape_index, init_value); + InitChildren(shape.tuple_shapes(i), init_value, &nodes_.back()); } } } @@ -482,63 +393,92 @@ void ShapeTree::InitChildren(const Shape& shape, const T& init_value, template void ShapeTree::InitChildren(const Shape& shape, Node* node) { if (ShapeUtil::IsTuple(shape)) { - for (int i = 0; i < ShapeUtil::TupleElementCount(shape); ++i) { - node->children.emplace_back(new Node()); - InitChildren(shape.tuple_shapes(i), node->children.back().get()); + const int64 size = ShapeUtil::TupleElementCount(shape); + node->children.reserve(size); + ShapeIndex shape_index = node->data.first; + shape_index.push_back(0); + for (int i = 0; i < size; ++i) { + shape_index[shape_index.size() - 1] = i; + node->children.push_back(nodes_.size()); + nodes_.emplace_back(shape_index); + InitChildren(shape.tuple_shapes(i), &nodes_.back()); } } } template ShapeTree::ShapeTree(Shape shape) - : root_(), - shape_storage_(::xla::MakeUnique(std::move(shape))), + : shape_storage_(std::make_shared(std::move(shape))), shape_(shape_storage_.get()) { // The shape_ field is just used to hold the structure of the shape. // It should not be relied upon to store layout information. LayoutUtil::ClearLayout(shape_storage_.get()); - InitChildren(*shape_, &root_); + nodes_.reserve(CountSubshapes(*shape_)); + nodes_.emplace_back(ShapeIndex{}); + InitChildren(*shape_, &nodes_[0]); +} + +template +ShapeTree::ShapeTree(const Shape* shape) : shape_(shape) { + nodes_.reserve(CountSubshapes(*shape_)); + nodes_.emplace_back(ShapeIndex{}); + InitChildren(*shape_, &nodes_[0]); } template -ShapeTree::ShapeTree(const Shape* shape) : root_(), shape_(shape) { - InitChildren(*shape_, &root_); +ShapeTree::ShapeTree(const std::shared_ptr& shape) + : shape_storage_(shape), shape_(shape_storage_.get()) { + nodes_.reserve(CountSubshapes(*shape_)); + nodes_.emplace_back(ShapeIndex{}); + InitChildren(*shape_, &nodes_[0]); } template ShapeTree::ShapeTree(Shape shape, const T& init_value) - : root_(init_value), - shape_storage_(::xla::MakeUnique(std::move(shape))), + : shape_storage_(std::make_shared(std::move(shape))), shape_(shape_storage_.get()) { // The shape_ field is just used to hold the structure of the shape. // It should not be relied upon to store layout information. 
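The constructors in this hunk reserve `nodes_` with `CountSubshapes` before `InitChildren` starts appending, which appears to be what keeps the parent `Node*` passed into `InitChildren` valid while children are pushed. A self-contained sketch of the count-then-reserve pattern, using a toy `ToyShape` type rather than `xla::Shape`:

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// Toy stand-in for xla::Shape: a node is either a leaf or a tuple of shapes.
struct ToyShape {
  std::vector<ToyShape> tuple_shapes;  // empty => leaf (array-like) shape
};

// Counts the shape itself plus all nested subshapes, mirroring the
// CountSubshapes helper added in the diff.
int64_t CountSubshapes(const ToyShape& shape) {
  int64_t count = 1;
  for (const ToyShape& sub : shape.tuple_shapes) {
    count += CountSubshapes(sub);
  }
  return count;
}

int main() {
  // (leaf, (leaf, leaf)) => 5 subshapes including the root.
  ToyShape shape{{ToyShape{}, ToyShape{{ToyShape{}, ToyShape{}}}}};
  std::vector<int> nodes;
  // Reserving once up front prevents reallocation while entries are appended,
  // which is why the real code can hold a raw pointer to the parent entry.
  nodes.reserve(CountSubshapes(shape));
  std::cout << "subshapes: " << CountSubshapes(shape) << "\n";  // prints 5
}
```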
LayoutUtil::ClearLayout(shape_storage_.get()); - InitChildren(*shape_, init_value, &root_); + nodes_.reserve(CountSubshapes(*shape_)); + nodes_.emplace_back(ShapeIndex{}, init_value); + InitChildren(*shape_, init_value, &nodes_[0]); } template ShapeTree::ShapeTree(const Shape* shape, const T& init_value) - : root_(init_value), shape_(shape) { - InitChildren(*shape_, init_value, &root_); + : shape_(shape) { + nodes_.reserve(CountSubshapes(*shape_)); + nodes_.emplace_back(ShapeIndex{}, init_value); + InitChildren(*shape_, init_value, &nodes_[0]); +} + +template +ShapeTree::ShapeTree(const std::shared_ptr& shape, + const T& init_value) + : shape_storage_(shape), shape_(shape_storage_.get()) { + nodes_.reserve(CountSubshapes(*shape_)); + nodes_.emplace_back(ShapeIndex{}, init_value); + InitChildren(*shape_, init_value, &nodes_[0]); } template const T& ShapeTree::element(const ShapeIndex& index) const { - return Lookup(index)->data; + return Lookup(index)->data.second; } template T* ShapeTree::mutable_element(const ShapeIndex& index) { - return &Lookup(index)->data; + return &Lookup(index)->data.second; } template internal::ShapeTreeNode* ShapeTree::Lookup(const ShapeIndex& index) { - Node* node = &root_; + Node* node = &nodes_[0]; for (const int64 i : index) { CHECK_GE(i, 0); CHECK_LT(i, node->children.size()); - node = node->children[i].get(); + node = &nodes_[node->children[i]]; } return node; } @@ -552,13 +492,10 @@ const internal::ShapeTreeNode* ShapeTree::Lookup( /* static */ template template -Status ShapeTree::ForEachHelper(const Fn& func, const Node& node, - ShapeIndex* index) { - TF_RETURN_IF_ERROR(func(*index, node.data)); - for (int64 i = 0; i < node.children.size(); ++i) { - index->push_back(i); - TF_RETURN_IF_ERROR(ForEachHelper(func, *node.children[i], index)); - index->pop_back(); +Status ShapeTree::ForEachHelper(const Fn& func, + const std::vector& nodes) { + for (const auto& node : nodes) { + TF_RETURN_IF_ERROR(func(node.data.first, node.data.second)); } return Status::OK(); } @@ -566,14 +503,10 @@ Status ShapeTree::ForEachHelper(const Fn& func, const Node& node, /* static */ template template -Status ShapeTree::ForEachMutableHelper(const Fn& func, Node* node, - ShapeIndex* index) { - TF_RETURN_IF_ERROR(func(*index, &node->data)); - for (int64 i = 0; i < node->children.size(); ++i) { - index->push_back(i); - TF_RETURN_IF_ERROR( - ForEachMutableHelper(func, node->children[i].get(), index)); - index->pop_back(); +Status ShapeTree::ForEachMutableHelper(const Fn& func, + std::vector* nodes) { + for (auto& node : *nodes) { + TF_RETURN_IF_ERROR(func(node.data.first, &node.data.second)); } return Status::OK(); } @@ -581,40 +514,36 @@ Status ShapeTree::ForEachMutableHelper(const Fn& func, Node* node, template template Status ShapeTree::ForEachElementWithStatus(const Fn& func) const { - ShapeIndex index; - return ForEachHelper(func, root_, &index); + return ForEachHelper(func, nodes_); } template template Status ShapeTree::ForEachMutableElementWithStatus(const Fn& func) { - ShapeIndex index; - return ForEachMutableHelper(func, &root_, &index); + return ForEachMutableHelper(func, &nodes_); } template template void ShapeTree::ForEachElement(const Fn& func) const { - ShapeIndex index; return ForEachHelper( [&func](const ShapeIndex& index, const T& data) { func(index, data); return Status::OK(); }, - root_, &index) + nodes_) .IgnoreError(); } template template void ShapeTree::ForEachMutableElement(const Fn& func) { - ShapeIndex index; return ForEachMutableHelper( [&func](const ShapeIndex& 
index, T* data) { func(index, data); return Status::OK(); }, - &root_, &index) + &nodes_) .IgnoreError(); } diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc index 4b6ab772811f4a..dc5facf1581c07 100644 --- a/tensorflow/compiler/xla/shape_tree_test.cc +++ b/tensorflow/compiler/xla/shape_tree_test.cc @@ -18,6 +18,7 @@ limitations under the License. #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/core/platform/test_benchmark.h" namespace xla { namespace { @@ -421,8 +422,8 @@ TEST_F(ShapeTreeTest, IterateAndMutate) { } ++i; } - t.begin()->second = 78; - EXPECT_EQ(78, t.begin()->second); + (*t.begin()).second = 78; + EXPECT_EQ(78, (*t.begin()).second); i = 0; for (auto& index_to_data : t) { if (i == 0) { @@ -434,14 +435,14 @@ TEST_F(ShapeTreeTest, IterateAndMutate) { } ++i; } - EXPECT_EQ(78, t.begin()->second); - EXPECT_EQ(98, std::next(t.begin())->second); + EXPECT_EQ(78, (*t.begin()).second); + EXPECT_EQ(98, (*std::next(t.begin())).second); } TEST_F(ShapeTreeTest, IterateOrder) { ShapeTree t(nested_tuple_shape_, 42); std::vector v; - for (auto& index_to_data : t) { + for (auto index_to_data : t) { v.push_back(index_to_data.first); } EXPECT_EQ(v, (std::vector{{}, @@ -479,7 +480,7 @@ TEST_F(ShapeTreeTest, ReverseIterateOrder) { TEST_F(ShapeTreeTest, IterateOrderLeaves) { ShapeTree t(nested_tuple_shape_, 42); std::vector v; - for (auto& index_to_data : t.leaves()) { + for (auto index_to_data : t.leaves()) { v.push_back(index_to_data.first); } EXPECT_EQ(v, (std::vector{ @@ -502,5 +503,106 @@ TEST_F(ShapeTreeTest, ReverseIterateOrderLeaves) { })); } +void BM_Construct(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = ShapeUtil::MakeTupleShape(shapes); + } + tensorflow::testing::StartTiming(); + + for (int i = 0; i < iters; ++i) { + ShapeTree shape_tree(shape); + } +} + +void BM_ConstructUnowned(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = ShapeUtil::MakeTupleShape(shapes); + } + tensorflow::testing::StartTiming(); + + for (int i = 0; i < iters; ++i) { + ShapeTree shape_tree(&shape); + } +} + +void BM_Copy(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = ShapeUtil::MakeTupleShape(shapes); + } + tensorflow::testing::StartTiming(); + + ShapeTree shape_tree(shape); + for (int i = 0; i < iters; ++i) { + ShapeTree copy = shape_tree; + tensorflow::testing::DoNotOptimize(copy); + } +} + +void BM_Move(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = ShapeUtil::MakeTupleShape(shapes); + } + tensorflow::testing::StartTiming(); + + ShapeTree shape_tree(shape); + for (int i = 0; i < iters; ++i) { + ShapeTree copy = std::move(shape_tree); + shape_tree = std::move(copy); + } +} + +void BM_ForEach(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + Shape shape = 
ShapeUtil::MakeShape(F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = ShapeUtil::MakeTupleShape(shapes); + } + tensorflow::testing::StartTiming(); + + ShapeTree shape_tree(shape); + for (int i = 0; i < iters; ++i) { + shape_tree.ForEachMutableElement([](const ShapeIndex& index, int* data) { + tensorflow::testing::DoNotOptimize(index); + }); + } +} + +void BM_Iterate(int iters, int depth, int fan_out) { + tensorflow::testing::StopTiming(); + Shape shape = ShapeUtil::MakeShape(F32, {32, 64, 128}); + for (int i = 0; i < depth; ++i) { + std::vector shapes(fan_out, shape); + shape = ShapeUtil::MakeTupleShape(shapes); + } + tensorflow::testing::StartTiming(); + + ShapeTree shape_tree(shape); + for (int i = 0; i < iters; ++i) { + for (auto& iter : shape_tree) { + tensorflow::testing::DoNotOptimize(iter.second); + } + } +} + +BENCHMARK(BM_Construct)->ArgPair(2, 8); +BENCHMARK(BM_ConstructUnowned)->ArgPair(2, 8); +BENCHMARK(BM_Copy)->ArgPair(2, 8); +BENCHMARK(BM_Move)->ArgPair(2, 8); +BENCHMARK(BM_ForEach)->ArgPair(2, 8); +BENCHMARK(BM_Iterate)->ArgPair(2, 8); + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc index ac7e201bfdceab..ce4d0079ee5eb2 100644 --- a/tensorflow/compiler/xla/shape_util.cc +++ b/tensorflow/compiler/xla/shape_util.cc @@ -27,11 +27,11 @@ limitations under the License. #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/types.h" -#include "tensorflow/compiler/xla/util.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/lib/core/stringpiece.h" #include "tensorflow/core/lib/gtl/iterator_range.h" #include "tensorflow/core/lib/gtl/optional.h" +#include "tensorflow/core/lib/hash/hash.h" #include "tensorflow/core/lib/strings/numbers.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" @@ -41,17 +41,35 @@ limitations under the License. namespace xla { +using ::tensorflow::strings::StrAppend; +using ::tensorflow::strings::StrCat; + string ShapeIndex::ToString() const { - return tensorflow::strings::StrCat( - "{", tensorflow::str_util::Join(indices_, ","), "}"); + return StrCat("{", tensorflow::str_util::Join(indices_, ","), "}"); } string ShapeIndexView::ToString() const { - return tensorflow::strings::StrCat( - "{", - tensorflow::str_util::Join(tensorflow::gtl::make_range(begin_, end_), - ","), - "}"); + return StrCat("{", + tensorflow::str_util::Join( + tensorflow::gtl::make_range(begin_, end_), ","), + "}"); +} + +bool ShapeIndexView::operator==(const ShapeIndexView& other) const { + if (size() != other.size()) { + return false; + } + for (auto it = begin(), other_it = other.begin(); it != end(); + ++it, ++other_it) { + if (*it != *other_it) { + return false; + } + } + return true; +} + +bool ShapeIndexView::operator!=(const ShapeIndexView& other) const { + return !(*this == other); } std::ostream& operator<<(std::ostream& out, const ShapeIndex& shape_index) { @@ -66,18 +84,30 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) { namespace { +// Returns whether the given primitive type corresponds to an array shape. +bool IsArrayPrimitiveType(PrimitiveType primitive_type) { + return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE && + primitive_type != OPAQUE && primitive_type != TOKEN; +} + // Recursive helper for comparing the equality of two shapes. 
Returns true if // the shapes are the same. If compare_layouts is true, then layouts must also // match. bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { - if (ShapeUtil::IsTuple(lhs) || ShapeUtil::IsTuple(rhs)) { - return ShapeUtil::IsTuple(lhs) && ShapeUtil::IsTuple(rhs) && - ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), + if (!ShapeUtil::SameElementType(lhs, rhs)) { + VLOG(3) << "CompareShapes: lhs element type != rhs element type"; + return false; + } + + if (ShapeUtil::IsTuple(lhs)) { + return ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), [=](const Shape& l, const Shape& r) { return CompareShapes(l, r, compare_layouts); }); - } else if (ShapeUtil::IsOpaque(lhs) || ShapeUtil::IsOpaque(rhs)) { - return ShapeUtil::IsOpaque(lhs) && ShapeUtil::IsOpaque(rhs); + } else if (!ShapeUtil::IsArray(lhs)) { + // Non-tuple, non-array tupes such as opaque and token types are trivially + // the same. + return true; } if (compare_layouts) { @@ -107,10 +137,6 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) { VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions"; return false; } - if (!ShapeUtil::SameElementType(lhs, rhs)) { - VLOG(3) << "CompareShapes: lhs element type != rhs element type"; - return false; - } return true; } @@ -153,8 +179,8 @@ StatusOr MakeShapeWithLayoutInternal( } /* static */ int64 ShapeUtil::Rank(const Shape& shape) { - CHECK(!ShapeUtil::IsTuple(shape)) - << "Tuples do not have a rank, shape: " << shape; + CHECK(ShapeUtil::IsArray(shape)) + << "Non-arrays do not have a rank, shape: " << shape; return shape.dimensions_size(); } @@ -181,8 +207,7 @@ StatusOr MakeShapeWithLayoutInternal( /* static */ Shape ShapeUtil::MakeShape( PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions) { - DCHECK_NE(TUPLE, element_type); - DCHECK_NE(OPAQUE, element_type); + CHECK(IsArrayPrimitiveType(element_type)); Shape result; PopulateShape(element_type, dimensions, &result); return result; @@ -205,8 +230,7 @@ StatusOr MakeShapeWithLayoutInternal( /* static */ Shape ShapeUtil::MakeShapeWithSparseLayout( PrimitiveType element_type, tensorflow::gtl::ArraySlice dimensions, int64 max_sparse_elements) { - DCHECK_NE(TUPLE, element_type); - DCHECK_NE(OPAQUE, element_type); + CHECK(IsArrayPrimitiveType(element_type)); Shape shape = ShapeUtil::MakeShape(element_type, dimensions); *shape.mutable_layout() = LayoutUtil::MakeSparseLayout(max_sparse_elements); TF_DCHECK_OK(ShapeUtil::ValidateShape(shape)); @@ -253,6 +277,13 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return result; } +/* static */ Shape ShapeUtil::MakeTokenShape() { + Shape result; + result.set_element_type(TOKEN); + TF_DCHECK_OK(ValidateShapeWithOptionalLayout(result)); + return result; +} + /* static */ void ShapeUtil::AppendShapeToTuple(const Shape& shape, Shape* tuple_shape) { TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape)); @@ -276,7 +307,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( } /* static */ bool ShapeUtil::ElementHasBitWidth(const Shape& shape, int bits) { - if (shape.element_type() == TUPLE || shape.element_type() == OPAQUE) { + if (!IsArray(shape)) { return false; } return primitive_util::BitWidth(shape.element_type()) == bits; @@ -302,6 +333,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( case C64: case TUPLE: case OPAQUE: + case TOKEN: return false; default: @@ -317,6 +349,10 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return 
primitive_util::IsFloatingPointType(shape.element_type()); } +/* static */ bool ShapeUtil::IsArray(const Shape& shape) { + return IsArrayPrimitiveType(shape.element_type()); +} + /* static */ bool ShapeUtil::IsNestedTuple(const Shape& shape) { return IsTuple(shape) && std::any_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(), IsTuple); @@ -370,7 +406,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( } /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) { - CHECK(!IsTuple(shape)) << ShapeUtil::HumanString(shape); + CHECK(IsArray(shape)) << ShapeUtil::HumanString(shape); CHECK_EQ(shape.dimensions_size(), Rank(shape)); return std::accumulate( shape.dimensions().begin(), shape.dimensions().end(), 1LL, @@ -385,23 +421,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout( return shape.element_type() == F32 && Rank(shape) == 0; } -/* static */ string ShapeUtil::HumanString(const Shape& shape) { - if (IsTuple(shape)) { - string text = "("; - const char* prefix = ""; - for (const Shape& elem_shape : shape.tuple_shapes()) { - tensorflow::strings::StrAppend(&text, prefix, HumanString(elem_shape)); - prefix = ", "; - } - text += ")"; - return text; - } else { - return tensorflow::strings::StrCat( - tensorflow::str_util::Lowercase( - PrimitiveType_Name(shape.element_type())), - "[", tensorflow::str_util::Join(shape.dimensions(), ","), "]"); - } -} namespace { @@ -452,48 +471,56 @@ StatusOr StringToPrimitiveType(const string& name) { } // namespace -/* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) { +/* static */ string ShapeUtil::HumanString(const Shape& shape) { if (IsTuple(shape)) { string text = "("; const char* prefix = ""; for (const Shape& elem_shape : shape.tuple_shapes()) { - tensorflow::strings::StrAppend(&text, prefix, - HumanStringWithLayout(elem_shape)); + StrAppend(&text, prefix, HumanString(elem_shape)); prefix = ", "; } text += ")"; return text; - } else { - string result = tensorflow::strings::StrCat( - LowercasePrimitiveTypeName(shape.element_type()), "["); - for (int i = 0; i < shape.dimensions().size(); i++) { - tensorflow::strings::StrAppend(&result, (i > 0) ? "," : "", - shape.dimensions(i)); + } + return StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[", + tensorflow::str_util::Join(shape.dimensions(), ","), "]"); +} + +/* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) { + if (IsTuple(shape)) { + string text = "("; + const char* prefix = ""; + for (const Shape& elem_shape : shape.tuple_shapes()) { + StrAppend(&text, prefix, HumanStringWithLayout(elem_shape)); + prefix = ", "; } - result += "]"; - if (!IsScalar(shape) && !IsOpaque(shape)) { - if (LayoutUtil::HasLayout(shape)) { - tensorflow::strings::StrAppend(&result, - LayoutUtil::HumanString(shape.layout())); - } + text += ")"; + return text; + } + string result = StrCat(LowercasePrimitiveTypeName(shape.element_type()), "["); + for (int i = 0; i < shape.dimensions().size(); i++) { + StrAppend(&result, (i > 0) ? 
"," : "", shape.dimensions(i)); + } + result += "]"; + if (!IsScalar(shape) && IsArray(shape)) { + if (LayoutUtil::HasLayout(shape)) { + StrAppend(&result, LayoutUtil::HumanString(shape.layout())); } - return result; } + return result; } /* static */ string ShapeUtil::HumanString(const ProgramShape& program_shape) { std::vector parameters; for (auto& shape : program_shape.parameters()) { const int i = parameters.size(); - parameters.push_back( - tensorflow::strings::StrCat(i < program_shape.parameter_names_size() - ? program_shape.parameter_names(i) - : "(unknown)", - ": ", HumanString(shape))); + parameters.push_back(StrCat(i < program_shape.parameter_names_size() + ? program_shape.parameter_names(i) + : "(unknown)", + ": ", HumanString(shape))); } - return tensorflow::strings::StrCat( - "(", tensorflow::str_util::Join(parameters, ", "), ") -> ", - HumanString(program_shape.result())); + return StrCat("(", tensorflow::str_util::Join(parameters, ", "), ") -> ", + HumanString(program_shape.result())); } namespace { @@ -510,7 +537,7 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { break; } else if (must_end) { return InvalidArgument("Expected end of tuple; got: \"%s\"", - s->ToString().c_str()); + std::string(*s).c_str()); } shapes.emplace_back(); TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s)); @@ -540,7 +567,7 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { if (!tensorflow::strings::safe_strto64(input.c_str(), &element)) { return InvalidArgument( "Invalid s64 value in parsed shape string: \"%s\" in \"%s\"", - input.c_str(), s->ToString().c_str()); + input.c_str(), std::string(*s).c_str()); } return element; }; @@ -563,14 +590,17 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { // Extract the primitive element type. TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type, StringToPrimitiveType(element_type_string)); - if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE || - primitive_type == OPAQUE) { + if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE) { return InvalidArgument("Invalid element type string: \"%s\".", element_type_string.c_str()); } Shape result; - if (format_string.empty() && layout_string.empty()) { + if (primitive_type == OPAQUE) { + result = ShapeUtil::MakeOpaqueShape(); + } else if (primitive_type == TOKEN) { + result = ShapeUtil::MakeTokenShape(); + } else if (format_string.empty() && layout_string.empty()) { // Create a shape without a layout set. 
result = ShapeUtil::MakeShape(primitive_type, dimensions); } else if (format_string == "sparse") { @@ -593,7 +623,7 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { } return InvalidArgument("Invalid shape string to parse: \"%s\"", - s->ToString().c_str()); + std::string(*s).c_str()); } } // namespace @@ -602,7 +632,7 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { TF_ASSIGN_OR_RETURN(Shape shape, ParseShapeStringInternal(&s)); if (!s.empty()) { return InvalidArgument("Invalid shape string to parse: \"%s\"", - s.ToString().c_str()); + std::string(s).c_str()); } return shape; } @@ -615,43 +645,44 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { } /* static */ bool ShapeUtil::Compatible(const Shape& lhs, const Shape& rhs) { - if (lhs.element_type() == TUPLE) { + if (IsArray(lhs)) { + return SameElementType(lhs, rhs) && SameDimensions(lhs, rhs); + } else if (lhs.element_type() == TUPLE) { return rhs.element_type() == TUPLE && ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), Compatible); + } else { + // Opaque, token, etc types are vacuously compatible. + return true; } - if (lhs.element_type() == OPAQUE) { - return rhs.element_type() == OPAQUE; - } - return SameElementType(lhs, rhs) && SameDimensions(lhs, rhs); } /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs, const Shape& rhs) { - if (lhs.element_type() == TUPLE) { + if (IsArray(lhs)) { + return IsArray(rhs) && SameDimensions(lhs, rhs); + } else if (lhs.element_type() == TUPLE) { return rhs.element_type() == TUPLE && ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), CompatibleIgnoringElementType); + } else { + // Opaque, token, etc types are vacuously compatible. + return true; } - if (lhs.element_type() == OPAQUE) { - return rhs.element_type() == OPAQUE; - } - return ShapeUtil::IsArray(rhs) && SameDimensions(lhs, rhs); } /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs, const Shape& rhs) { - if (lhs.element_type() == TUPLE) { + if (IsArray(lhs)) { + return IsArray(rhs) && SameElementTypeIgnoringFpPrecision(lhs, rhs) && + CompatibleIgnoringElementType(lhs, rhs); + } else if (lhs.element_type() == TUPLE) { return rhs.element_type() == TUPLE && ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), CompatibleIgnoringFpPrecision); + } else { + // Opaque, token, etc types are vacuously compatible. + return true; } - if (lhs.element_type() == OPAQUE) { - return rhs.element_type() == OPAQUE; - } - if (SameElementTypeIgnoringFpPrecision(lhs, rhs)) { - return CompatibleIgnoringElementType(lhs, rhs); - } - return false; } /* static */ int64 ShapeUtil::GetDimension(const Shape& shape, @@ -673,10 +704,6 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { switch (primitive_type) { case PRED: return sizeof(int8); - case TUPLE: - LOG(FATAL) << "tuples have no definitive size"; - case OPAQUE: - LOG(FATAL) << "opaque have no definitive size"; case S8: return sizeof(int8); case S16: @@ -703,6 +730,13 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { return sizeof(double); case C64: return sizeof(complex64); + case TOKEN: + // Tokens require no space. 
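The restructured Compatible family above follows a single three-way dispatch: arrays compare dimensions (and, per variant, element type), tuples recurse element-wise, and remaining kinds such as opaque and token are treated as vacuously compatible. A simplified sketch of that dispatch on a toy type (slightly stricter than the snippet above, and not a drop-in reproduction):

```c++
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

enum class Kind { kArray, kTuple, kOpaque, kToken };

struct MiniShape {
  Kind kind = Kind::kArray;
  int element_type = 0;  // stand-in for PrimitiveType
  std::vector<int64_t> dimensions;
  std::vector<MiniShape> tuple_shapes;
};

// Arrays need matching element type and dimensions, tuples recurse,
// other kinds carry no structure to compare.
bool Compatible(const MiniShape& lhs, const MiniShape& rhs) {
  if (lhs.kind == Kind::kArray) {
    return rhs.kind == Kind::kArray &&
           lhs.element_type == rhs.element_type &&
           lhs.dimensions == rhs.dimensions;
  }
  if (lhs.kind == Kind::kTuple) {
    if (rhs.kind != Kind::kTuple ||
        lhs.tuple_shapes.size() != rhs.tuple_shapes.size()) {
      return false;
    }
    for (size_t i = 0; i < lhs.tuple_shapes.size(); ++i) {
      if (!Compatible(lhs.tuple_shapes[i], rhs.tuple_shapes[i])) return false;
    }
    return true;
  }
  return true;  // opaque, token, ...
}

int main() {
  MiniShape a{Kind::kArray, 1, {2, 3}, {}};
  MiniShape b{Kind::kArray, 1, {2, 3}, {}};
  MiniShape t{Kind::kToken, 0, {}, {}};
  std::cout << Compatible(a, b) << " " << Compatible(t, t) << "\n";  // 1 1
}
```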
+ return 0; + case TUPLE: + case OPAQUE: + LOG(FATAL) << PrimitiveType_Name(primitive_type) + << " primitive type has no definitive size"; default: LOG(FATAL) << "Unhandled primitive type " << primitive_type; } @@ -711,28 +745,32 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { /* static */ int64 ShapeUtil::ByteSizeOf(const Shape& shape, int64 pointer_size) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK_NE(OPAQUE, shape.element_type()); if (shape.element_type() == TUPLE) { return ByteSizeOfTupleIndexTable(shape, pointer_size); + } else if (IsArray(shape)) { + int64 byte_size = ByteSizeOfElements(shape); + if (LayoutUtil::IsSparseArray(shape)) { + byte_size += ByteSizeOfSparseIndices(shape); + } + return byte_size; + } else if (shape.element_type() == TOKEN) { + return 0; } - int64 byte_size = ByteSizeOfElements(shape); - if (LayoutUtil::IsSparseArray(shape)) { - byte_size += ByteSizeOfSparseIndices(shape); - } - return byte_size; + LOG(FATAL) << PrimitiveType_Name(shape.element_type()) + << " primitive type has no definitive size"; } /* static */ int64 ShapeUtil::ByteSizeOfTupleIndexTable(const Shape& shape, int64 pointer_size) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK_EQ(TUPLE, shape.element_type()); + CHECK_EQ(TUPLE, shape.element_type()); CHECK_GT(pointer_size, 0); return pointer_size * shape.tuple_shapes_size(); } /* static */ int64 ShapeUtil::ByteSizeOfElements(const Shape& shape) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK(ShapeUtil::IsArray(shape)); + CHECK(ShapeUtil::IsArray(shape)); int64 allocated_element_count; if (LayoutUtil::IsSparseArray(shape)) { @@ -757,13 +795,17 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { /* static */ int64 ShapeUtil::ByteSizeOfSparseIndices(const Shape& shape) { TF_DCHECK_OK(ValidateShape(shape)); - DCHECK(LayoutUtil::IsSparseArray(shape)); + CHECK(LayoutUtil::IsSparseArray(shape)); return LayoutUtil::MaxSparseElements(shape.layout()) * ShapeUtil::Rank(shape) * sizeof(int64); } /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal( const Shape& shape) { + if (shape.element_type() == PRIMITIVE_TYPE_INVALID) { + return InvalidArgument("shape has invalid element type: %s", + shape.ShortDebugString().c_str()); + } if (shape.element_type() == TUPLE) { if (shape.dimensions_size() != 0) { return InvalidArgument("tuples must not have dimensions specified"); @@ -779,10 +821,24 @@ StatusOr ParseShapeStringInternal(tensorflow::StringPiece* s) { if (shape.tuple_shapes_size() > 0) { return InvalidArgument("non-tuple shape has tuple_shapes field"); } - if (shape.element_type() == PRIMITIVE_TYPE_INVALID) { - return InvalidArgument("shape has invalid element type: %s", - shape.ShortDebugString().c_str()); + + // Tokens and opaques can should not have layout or dimensions. 
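For orientation, the ByteSizeOf rework in this hunk costs tuples as a pointer table, arrays as element storage (plus sparse indices where applicable), and tokens as zero bytes. Below is a compact standalone sketch of that dispatch; the `MiniShape` type and the fixed per-element size are illustrative assumptions.

```c++
#include <cstdint>
#include <iostream>
#include <vector>

enum class Kind { kArray, kTuple, kToken };

struct MiniShape {
  Kind kind = Kind::kArray;
  int64_t element_size = 4;  // bytes per element, e.g. 4 for f32
  std::vector<int64_t> dimensions;
  std::vector<MiniShape> tuple_shapes;
};

int64_t ByteSizeOf(const MiniShape& shape, int64_t pointer_size) {
  switch (shape.kind) {
    case Kind::kTuple:
      // A tuple's top-level buffer is just a table of pointers to elements.
      return pointer_size * static_cast<int64_t>(shape.tuple_shapes.size());
    case Kind::kArray: {
      int64_t elements = 1;
      for (int64_t dim : shape.dimensions) elements *= dim;
      return elements * shape.element_size;
    }
    case Kind::kToken:
      return 0;  // Tokens only order side effects; they occupy no storage.
  }
  return 0;
}

int main() {
  MiniShape arr{Kind::kArray, 4, {10, 20}, {}};
  MiniShape tup{Kind::kTuple, 0, {}, {arr, arr}};
  MiniShape tok{Kind::kToken, 0, {}, {}};
  std::cout << ByteSizeOf(arr, 8) << " "    // 800
            << ByteSizeOf(tup, 8) << " "    // 16
            << ByteSizeOf(tok, 8) << "\n";  // 0
}
```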
+ if (shape.element_type() == TOKEN || shape.element_type() == OPAQUE) { + if (shape.dimensions_size() != 0) { + return InvalidArgument( + "shape has %s element type, but has dimensions field: %s", + LowercasePrimitiveTypeName(shape.element_type()).c_str(), + shape.ShortDebugString().c_str()); + } + if (shape.has_layout()) { + return InvalidArgument( + "shape has %s element type, but has layout field: %s", + LowercasePrimitiveTypeName(shape.element_type()).c_str(), + shape.ShortDebugString().c_str()); + } + return Status::OK(); } + if (Rank(shape) != shape.dimensions_size()) { return InvalidArgument( "shape's rank is mismatched with dimension count; rank=%lld " @@ -862,7 +918,30 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) { return !IsTuple(GetSubshape(shape, index)); } +/* static */ int64 ShapeUtil::GetLeafCount(const Shape& shape) { + int64 count = 0; + ForEachSubshape(shape, [&](const Shape&, const ShapeIndex& index) { + if (IsLeafIndex(shape, index)) { + ++count; + } + }); + return count; +} + +/* static */ std::vector ShapeUtil::GetLeafShapes( + const Shape& shape) { + std::vector leaves; + ForEachSubshape(shape, [&](const Shape& sub_shape, const ShapeIndex& index) { + if (IsLeafIndex(shape, index)) { + leaves.emplace_back(index, sub_shape); + } + }); + return leaves; +} + /* static */ Shape ShapeUtil::StripDegenerateDimensions(const Shape& shape) { + CHECK(IsArray(shape)); + std::vector dimension_sizes; std::vector degenerate_dimensions; for (int64 i = 0; i < shape.dimensions_size(); ++i) { @@ -905,10 +984,17 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) { std::is_permutation(minor_to_major.begin(), minor_to_major.end(), dims.begin())); } - Shape stripped_shape = - shape.has_layout() ? MakeShapeWithLayout(shape.element_type(), - dimension_sizes, minor_to_major) - : MakeShape(shape.element_type(), dimension_sizes); + Shape stripped_shape; + if (LayoutUtil::IsDenseArray(shape)) { + stripped_shape = MakeShapeWithLayout(shape.element_type(), dimension_sizes, + minor_to_major); + } else if (LayoutUtil::IsSparseArray(shape)) { + stripped_shape = + MakeShapeWithSparseLayout(shape.element_type(), dimension_sizes, + shape.layout().max_sparse_elements()); + } else { + stripped_shape = MakeShape(shape.element_type(), dimension_sizes); + } VLOG(10) << "Original_shape: " << HumanStringWithLayout(shape); VLOG(10) << "Stripped_shape: " << HumanStringWithLayout(stripped_shape); @@ -1020,6 +1106,9 @@ Status ForEachMutableSubshapeHelper( /* static */ std::tuple, std::vector> ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre, const Shape& shape_post) { + CHECK(IsArray(shape_pre)); + CHECK(IsArray(shape_post)); + auto nil = std::make_tuple(false, std::vector(), std::vector()); std::vector deleted_indices; @@ -1077,6 +1166,9 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre, /* static */ std::vector> ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, const Shape& output_shape) { + CHECK(IsArray(input_shape)); + CHECK(IsArray(output_shape)); + // Unmodified dimensions are merely common factors of rank 1. 
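GetLeafCount and GetLeafShapes above are thin wrappers around a pre-order subshape walk that keeps only leaf indices (pairing them with their shapes via the new IndexedShape). A standalone sketch of that walk on a toy nested-tuple type; the index-as-vector representation is a stand-in for ShapeIndex.

```c++
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

struct ToyShape {
  std::vector<ToyShape> tuple_shapes;  // empty => leaf
};

// Pre-order walk over all subshapes, recording the index path of each leaf,
// in the spirit of ForEachSubshape + IsLeafIndex.
void CollectLeaves(const ToyShape& shape, std::vector<int64_t>* index,
                   std::vector<std::vector<int64_t>>* leaves) {
  if (shape.tuple_shapes.empty()) {
    leaves->push_back(*index);
    return;
  }
  for (int64_t i = 0; i < static_cast<int64_t>(shape.tuple_shapes.size()); ++i) {
    index->push_back(i);
    CollectLeaves(shape.tuple_shapes[i], index, leaves);
    index->pop_back();
  }
}

int main() {
  // ((leaf, leaf), leaf)
  ToyShape shape{{ToyShape{{ToyShape{}, ToyShape{}}}, ToyShape{}}};
  std::vector<int64_t> index;
  std::vector<std::vector<int64_t>> leaves;
  CollectLeaves(shape, &index, &leaves);
  std::cout << "leaf count: " << leaves.size() << "\n";  // 3
  for (const auto& leaf : leaves) {
    std::string s = "{";
    for (size_t i = 0; i < leaf.size(); ++i)
      s += (i ? "," : "") + std::to_string(leaf[i]);
    std::cout << s << "}\n";  // {0,0} {0,1} {1}
  }
}
```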
auto common_factors = CommonFactors(AsInt64Slice(input_shape.dimensions()), AsInt64Slice(output_shape.dimensions())); @@ -1130,8 +1222,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ bool ShapeUtil::ReshapeIsBitcast(const Shape& input_shape, const Shape& output_shape) { - CHECK(LayoutUtil::HasLayout(input_shape) && - LayoutUtil::HasLayout(output_shape)); + CHECK(IsArray(input_shape)); + CHECK(IsArray(output_shape)); + CHECK(LayoutUtil::HasLayout(input_shape)); + CHECK(LayoutUtil::HasLayout(output_shape)); if (!SameElementType(input_shape, output_shape)) { return false; @@ -1293,6 +1387,9 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ tensorflow::gtl::optional ShapeUtil::AlignLayouts( const Shape& input_shape, const Shape& output_shape) { + CHECK(IsArray(input_shape)); + CHECK(IsArray(output_shape)); + int64 input_rank = Rank(input_shape); int64 output_rank = Rank(output_shape); @@ -1427,6 +1524,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete, Shape shape) { + CHECK(IsArray(shape)); shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete); if (LayoutUtil::HasLayout(shape)) { Layout* layout = shape.mutable_layout(); @@ -1448,6 +1546,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape, /* static */ Shape ShapeUtil::FilterDimensions( const std::function& p, Shape shape) { + CHECK(IsArray(shape)); std::vector dims_to_delete; for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) { if (!p(i)) { @@ -1465,4 +1564,26 @@ std::ostream& operator<<(std::ostream& out, const Shape& shape) { return out; } +/*static*/ size_t ShapeUtil::Hash(const Shape& shape) { + using tensorflow::hash; + using tensorflow::Hash64Combine; + + size_t hash_value = hash()(shape.element_type()); + + if (shape.tuple_shapes().empty()) { + for (int64 dim : shape.dimensions()) { + hash_value = Hash64Combine(hash_value, hash()(dim)); + } + + hash_value = Hash64Combine(hash_value, LayoutUtil::Hash(shape.layout())); + } else { + hash_value = 0; + for (const Shape& subshape : shape.tuple_shapes()) { + hash_value = Hash64Combine(hash_value, ShapeUtil::Hash(subshape)); + } + } + + return hash_value; +} + } // namespace xla diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h index 5fa728e7c2fa5f..3853ada6ba65db 100644 --- a/tensorflow/compiler/xla/shape_util.h +++ b/tensorflow/compiler/xla/shape_util.h @@ -27,6 +27,7 @@ limitations under the License. #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" +#include "tensorflow/compiler/xla/util.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -132,6 +133,9 @@ class ShapeIndexView { return ShapeIndexView(new_begin, end_); } + bool operator==(const ShapeIndexView& other) const; + bool operator!=(const ShapeIndexView& other) const; + string ToString() const; private: @@ -150,12 +154,22 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index); // properties, which do invariant checks before / after the operation. class ShapeUtil { public: + // Data structure which describes the coordinates and the shape, of a tuple + // shaped sub-shape. 
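The new ShapeUtil::Hash above hashes the element type plus dimensions (and layout) for non-tuple shapes and folds subshape hashes together for tuples. Here is a self-contained analogue using std::hash, with a boost-style mixer standing in for tensorflow::Hash64Combine; it mirrors the structure of the diff but is not bit-for-bit the same function (in particular, it omits the layout term).

```c++
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct MiniShape {
  int element_type = 0;                 // stand-in for PrimitiveType
  std::vector<int64_t> dimensions;
  std::vector<MiniShape> tuple_shapes;  // non-empty => tuple
};

// Stand-in for tensorflow::Hash64Combine (boost-style mixing).
size_t HashCombine(size_t seed, size_t value) {
  return seed ^ (value + 0x9e3779b97f4a7c15ULL + (seed << 6) + (seed >> 2));
}

size_t HashShape(const MiniShape& shape) {
  if (!shape.tuple_shapes.empty()) {
    // Tuple: fold the subshape hashes together.
    size_t h = 0;
    for (const MiniShape& sub : shape.tuple_shapes) {
      h = HashCombine(h, HashShape(sub));
    }
    return h;
  }
  // Non-tuple: seed with the element type, then mix in each dimension.
  // (The real implementation also folds in LayoutUtil::Hash(shape.layout()).)
  size_t h = std::hash<int>()(shape.element_type);
  for (int64_t dim : shape.dimensions) {
    h = HashCombine(h, std::hash<int64_t>()(dim));
  }
  return h;
}

int main() {
  MiniShape f32{1, {3, 4}, {}};
  MiniShape tuple{0, {}, {f32, f32}};
  std::cout << HashShape(f32) << "\n" << HashShape(tuple) << "\n";
}
```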
+ struct IndexedShape { + IndexedShape() = default; + IndexedShape(ShapeIndex index, Shape shape) + : index(std::move(index)), shape(std::move(shape)) {} + ShapeIndex index; + Shape shape; + }; + // Returns the number of elements are contained within the provided shape; // e.g. for rank 0 (scalars) the result is always 1. Note that sparse shapes // may not actually be able to store this number of elements. See // LayoutUtil::MaxSparseElements(shape) to obtain the maximum number of // elements that can be stored in a sparse shape. - // Precondition: !IsTuple(shape) + // Precondition: IsArray(shape) static int64 ElementsIn(const Shape& shape); // Returns true if 'shape' has zero elements. @@ -166,13 +180,11 @@ class ShapeUtil { // shapes. This includes only the size of the top-level buffer. For example, a // tuple is stored as an array of pointers to other buffers. In this case, // this method only returns the size of the pointer array. - // Precondition: (!ShapeUtil::IsTuple(shape) || pointer_size > 0) && - // !ShapeUtil::IsOpaque(shape) static int64 ByteSizeOf(const Shape& shape, int64 pointer_size = -1); // Returns the number of bytes used to store the primitive_type. // - // Precondition: !ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape) + // Precondition: ShapeUtil::IsArray(shape) static int64 ByteSizeOfPrimitiveType(PrimitiveType primitive_type); // Returns the number of bytes required to store the tuple member pointers for @@ -279,10 +291,10 @@ class ShapeUtil { // Scalar-specific static bool IsScalar(const Shape& shape) { - return !IsTuple(shape) && !IsOpaque(shape) && Rank(shape) == 0; + return IsArray(shape) && Rank(shape) == 0; } static bool IsEffectiveScalar(const Shape& shape) { - return !IsTuple(shape) && !IsOpaque(shape) && TrueRank(shape) == 0; + return IsArray(shape) && TrueRank(shape) == 0; } static bool IsScalarF32(const Shape& shape); @@ -311,6 +323,10 @@ class ShapeUtil { // into a custom operation. static Shape MakeOpaqueShape(); + // Creates a token shape. Values of this shape are used for ordering + // side-effecting operations. + static Shape MakeTokenShape(); + // Appends a shape to the given tuple. static void AppendShapeToTuple(const Shape& shape, Shape* tuple_shape); @@ -410,11 +426,15 @@ class ShapeUtil { return shape.element_type() == OPAQUE; } + // Returns whether the shape is an token value used for ordering + // side-effecting operations. + static bool IsToken(const Shape& shape) { + return shape.element_type() == TOKEN; + } + // Returns whether the shape is an array. Note that scalars are considered // arrays. - static bool IsArray(const Shape& shape) { - return !IsTuple(shape) && !IsOpaque(shape); - } + static bool IsArray(const Shape& shape); // Returns whether the shape is a tuple with at least one element which is // also a tuple. @@ -461,6 +481,13 @@ class ShapeUtil { // shape. static bool IsLeafIndex(const Shape& shape, const ShapeIndex& index); + // Returns the number of leaves in the shape. + static int64 GetLeafCount(const Shape& shape); + + // Retrieves all the leaf shapes and their indexes, in the order walked by + // the ForEachSubshape() API. + static std::vector GetLeafShapes(const Shape& shape); + // Calls the given visitor function for each subshape of the given shape. // Subshapes are visited in DFS pre-order starting with the entire shape // (index {}). 
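The convenience ForEachIndex overloads added in the next hunk visit every element index of an array shape using a zero base and unit increments. A standalone sketch of such an index "odometer" loop is below; the visiting order here is simply last-dimension-fastest and does not attempt to reproduce the layout-aware order of the real helpers, and the void-returning visitor is a simplification.

```c++
#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// Visits every index of a dense shape with the given dimensions, starting at
// a zero base with unit increments in every dimension.
void ForEachIndex(const std::vector<int64_t>& dimensions,
                  const std::function<void(const std::vector<int64_t>&)>& visitor) {
  std::vector<int64_t> index(dimensions.size(), 0);
  if (dimensions.empty()) {  // scalar: a single empty index
    visitor(index);
    return;
  }
  for (int64_t dim : dimensions) {
    if (dim == 0) return;  // zero-element shape: nothing to visit
  }
  while (true) {
    visitor(index);
    // Odometer increment: bump the last dimension, carry leftwards.
    int64_t d = static_cast<int64_t>(index.size()) - 1;
    for (; d >= 0; --d) {
      if (++index[d] < dimensions[d]) break;
      index[d] = 0;
    }
    if (d < 0) return;  // wrapped past the first dimension: done
  }
}

int main() {
  ForEachIndex({2, 3}, [](const std::vector<int64_t>& idx) {
    std::cout << "{" << idx[0] << "," << idx[1] << "}\n";
  });  // {0,0} {0,1} {0,2} {1,0} {1,1} {1,2}
}
```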
@@ -626,6 +653,28 @@ class ShapeUtil { .IgnoreError(); } + // These convenience wrappers don't take `base`, `count` and `incr` + // explicitly, but iterate over every element in `shape` instead. + + template + static Status ForEachIndexWithStatus(const Shape& shape, + const FnType& visitor_function) { + std::vector base(shape.dimensions_size()); + std::vector incr(shape.dimensions_size(), 1); + return ForEachIndexWithStatus(shape, base, + /*count=*/AsInt64Slice(shape.dimensions()), + incr, visitor_function); + } + + template + static void ForEachIndex(const Shape& shape, const FnType& visitor_function) { + ForEachIndexWithStatus(shape, + [&](tensorflow::gtl::ArraySlice indices) { + return StatusOr(visitor_function(indices)); + }) + .IgnoreError(); + } + // A parallel version of ForEachIndex(WithStatus). This can only be used if // the visitor_function is thread-safe and the order of iteration does not // matter. @@ -650,6 +699,9 @@ class ShapeUtil { .ok()); } + // Compute a hash for `shape`. + static size_t Hash(const Shape& shape); + private: // Validates all of the non-layout properties of the shape -- this is a helper // used by both the layout-optional and layout-required public method. diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc index 13582a2a267854..ecdb6532f1d743 100644 --- a/tensorflow/compiler/xla/shape_util_test.cc +++ b/tensorflow/compiler/xla/shape_util_test.cc @@ -93,12 +93,14 @@ TEST(ShapeUtilTest, ParseShapeStringTupleOfArrays) { } TEST(ShapeUtilTest, ParseShapeStringNestedTuple) { - string shape_string = "(f32[1],(f32[2]), f32[3])"; + string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])"; TF_ASSERT_OK_AND_ASSIGN(Shape actual, ShapeUtil::ParseShapeString(shape_string)); Shape expected = ShapeUtil::MakeTupleShape({ ShapeUtil::MakeShape(F32, {1}), - ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2})}), + ShapeUtil::MakeTupleShape( + {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeTokenShape()}), + ShapeUtil::MakeOpaqueShape(), ShapeUtil::MakeShape(F32, {3}), }); ASSERT_TRUE(ShapeUtil::Equal(expected, actual)) @@ -136,6 +138,23 @@ TEST(ShapeUtilTest, ParseShapeStringWithSparseLayout) { << "actual: " << ShapeUtil::HumanString(actual); } +TEST(ShapeUtilTest, ParseOpaqueType) { + TF_ASSERT_OK_AND_ASSIGN(Shape actual, + ShapeUtil::ParseShapeString("opaque[]")); + Shape expected = ShapeUtil::MakeOpaqueShape(); + ASSERT_TRUE(ShapeUtil::Equal(expected, actual)) + << "expected: " << ShapeUtil::HumanString(expected) + << "actual: " << ShapeUtil::HumanString(actual); +} + +TEST(ShapeUtilTest, ParseTokenType) { + TF_ASSERT_OK_AND_ASSIGN(Shape actual, ShapeUtil::ParseShapeString("token[]")); + Shape expected = ShapeUtil::MakeTokenShape(); + ASSERT_TRUE(ShapeUtil::Equal(expected, actual)) + << "expected: " << ShapeUtil::HumanString(expected) + << "actual: " << ShapeUtil::HumanString(actual); +} + TEST(ShapeUtilTest, ParseInvalidShapeString) { string shape_strings[] = { "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}", @@ -295,6 +314,9 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) { EXPECT_EQ(8, ShapeUtil::ByteSizeOfPrimitiveType(C64)); EXPECT_EQ(8, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(C64, {}))); EXPECT_EQ(1600, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(C64, {10, 20}))); + + EXPECT_EQ(0, ShapeUtil::ByteSizeOfPrimitiveType(TOKEN)); + EXPECT_EQ(0, ShapeUtil::ByteSizeOf(ShapeUtil::MakeTokenShape())); } TEST(ShapeUtilTest, ByteSizeOfWithPadding) { @@ -449,19 +471,21 @@ 
TEST(ShapeUtilTest, IsLeafIndex) { TEST(ShapeUtilTest, HumanString) { Shape opaque = ShapeUtil::MakeOpaqueShape(); + Shape token = ShapeUtil::MakeTokenShape(); Shape scalar = ShapeUtil::MakeShape(F32, {}); Shape matrix = ShapeUtil::MakeShape(U32, {1, 2}); Shape matrix2 = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1}); Shape tuple = ShapeUtil::MakeTupleShape({opaque, scalar, matrix, matrix2}); - Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix}); + Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix, token}); EXPECT_EQ("opaque[]", ShapeUtil::HumanString(opaque)); + EXPECT_EQ("token[]", ShapeUtil::HumanString(token)); EXPECT_EQ("f32[]", ShapeUtil::HumanString(scalar)); EXPECT_EQ("u32[1,2]", ShapeUtil::HumanString(matrix)); EXPECT_EQ("s32[3,4]", ShapeUtil::HumanString(matrix2)); EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])", ShapeUtil::HumanString(tuple)); - EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])", + EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", ShapeUtil::HumanString(nested_tuple)); EXPECT_EQ("opaque[]", ShapeUtil::HumanStringWithLayout(opaque)); @@ -470,8 +494,10 @@ TEST(ShapeUtilTest, HumanString) { EXPECT_EQ("s32[3,4]{0,1}", ShapeUtil::HumanStringWithLayout(matrix2)); EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})", ShapeUtil::HumanStringWithLayout(tuple)); - EXPECT_EQ("((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0})", - ShapeUtil::HumanStringWithLayout(nested_tuple)); + EXPECT_EQ( + "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, " + "token[])", + ShapeUtil::HumanStringWithLayout(nested_tuple)); ProgramShape prog = ShapeUtil::MakeProgramShape( {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple); @@ -481,8 +507,9 @@ TEST(ShapeUtilTest, HumanString) { "(unknown): u32[1,2], " "(unknown): s32[3,4], " "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), " - "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])) -> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])", + "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", ShapeUtil::HumanString(prog)); prog.add_parameter_names("arg0"); @@ -497,8 +524,10 @@ TEST(ShapeUtilTest, HumanString) { "matrix: u32[1,2], " "matrix2: s32[3,4], " "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), " - "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])) -> " - "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])", + "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], " + "token[])) " + "-> " + "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])", ShapeUtil::HumanString(prog)); } @@ -713,6 +742,16 @@ TEST(ShapeUtilTest, ReshapeIsBitcast_3x2x2_6x2_Dim1IsMostMinor) { ShapeUtil::MakeShapeWithLayout(F32, {6, 2}, {0, 1}))); } +TEST(ShapeUtilTest, StripDegenerateDimensions) { + EXPECT_TRUE(ShapeUtil::Equal(ShapeUtil::StripDegenerateDimensions( + ShapeUtil::MakeShape(F32, {3, 1, 2})), + ShapeUtil::MakeShape(F32, {3, 2}))); + EXPECT_TRUE(ShapeUtil::Equal( + ShapeUtil::StripDegenerateDimensions( + ShapeUtil::MakeShapeWithSparseLayout(F32, {3, 1, 2}, 10)), + ShapeUtil::MakeShapeWithSparseLayout(F32, {3, 2}, 10))); +} + TEST(AlgebraicSimplifierTest, ReshapeIsBitcast_3x2x2_6x2_Dim0IsMostMinor) { EXPECT_FALSE(ShapeUtil::ReshapeIsBitcast( ShapeUtil::MakeShapeWithLayout(F32, {3, 2, 2}, {0, 1, 2}), diff --git a/tensorflow/compiler/xla/status.h b/tensorflow/compiler/xla/status.h index 
4eb3bf3766412d..69abb51852ac09 100644 --- a/tensorflow/compiler/xla/status.h +++ b/tensorflow/compiler/xla/status.h @@ -21,7 +21,7 @@ limitations under the License. namespace xla { -using tensorflow::Status; +using tensorflow::Status; // TENSORFLOW_STATUS_OK } // namespace xla diff --git a/tensorflow/compiler/xla/statusor.h b/tensorflow/compiler/xla/statusor.h index cccbce5fc83af8..0e1387c93938fa 100644 --- a/tensorflow/compiler/xla/statusor.h +++ b/tensorflow/compiler/xla/statusor.h @@ -13,13 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// StatusOr is the union of a Status object and a T -// object. StatusOr models the concept of an object that is either a -// usable value, or an error Status explaining why such a value is -// not present. To this end, StatusOr does not allow its Status -// value to be Status::OK. Furthermore, the value of a StatusOr -// must not be null. This is enforced by a debug check in most cases, -// but even when it is not, clients must not set the value to null. +// StatusOr is the union of a Status object and a T object. StatusOr models +// the concept of an object that is either a value, or an error Status +// explaining why such a value is not present. To this end, StatusOr does not +// allow its Status value to be Status::OK. // // The primary use-case for StatusOr is as the return value of a // function which may fail. diff --git a/tensorflow/compiler/xla/statusor_test.cc b/tensorflow/compiler/xla/statusor_test.cc index f9d25945bc6175..377a618ffbd993 100644 --- a/tensorflow/compiler/xla/statusor_test.cc +++ b/tensorflow/compiler/xla/statusor_test.cc @@ -75,6 +75,14 @@ TEST(StatusOr, ElementType) { static_assert(std::is_same::element_type, char>(), ""); } +TEST(StatusOr, NullPointerStatusOr) { + // As a very special case, null-plain-pointer StatusOr used to be an + // error. Test that it no longer is. + StatusOr null_status(nullptr); + EXPECT_TRUE(null_status.ok()); + EXPECT_EQ(null_status.ValueOrDie(), nullptr); +} + TEST(StatusOr, TestNoDefaultConstructorInitialization) { // Explicitly initialize it with an error code. StatusOr statusor(tensorflow::errors::Cancelled("")); @@ -405,7 +413,7 @@ TEST(StatusOr, TestPointerValueConst) { EXPECT_EQ(&kI, thing.ValueOrDie()); } -// NOTE(tucker): tensorflow::StatusOr does not support this kind +// NOTE(tucker): StatusOr does not support this kind // of resize op. // TEST(StatusOr, StatusOrVectorOfUniquePointerCanResize) { // using EvilType = std::vector>; diff --git a/tensorflow/compiler/xla/test_helpers.h b/tensorflow/compiler/xla/test_helpers.h index 17bae2e4f61126..8918350135fbb8 100644 --- a/tensorflow/compiler/xla/test_helpers.h +++ b/tensorflow/compiler/xla/test_helpers.h @@ -40,13 +40,10 @@ class Literal; namespace testing { namespace internal_status { -inline const ::tensorflow::Status& GetStatus( - const ::tensorflow::Status& status) { - return status; -} +inline const Status& GetStatus(const Status& status) { return status; } template -inline const ::tensorflow::Status& GetStatus(const StatusOr& status) { +inline const Status& GetStatus(const StatusOr& status) { return status.status(); } } // namespace internal_status @@ -57,21 +54,17 @@ inline const ::tensorflow::Status& GetStatus(const StatusOr& status) { // The following macros are similar to macros in gmock, but deliberately named // differently in order to avoid conflicts in files which include both. 
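Given the rewritten statusor.h comment and the new null-pointer test above, a short usage sketch may help. The pattern below only uses members that already appear in this diff (ok(), ValueOrDie(), status()) and assumes the TensorFlow/XLA build environment; the ParsePositive function itself is hypothetical.

```c++
#include "tensorflow/compiler/xla/statusor.h"
#include "tensorflow/core/lib/core/errors.h"

namespace xla {

// A function that either produces a value or explains why it could not.
StatusOr<int> ParsePositive(int raw) {
  if (raw <= 0) {
    return tensorflow::errors::InvalidArgument("expected a positive value");
  }
  return raw;
}

void Demo() {
  StatusOr<int> result = ParsePositive(3);
  if (result.ok()) {
    int value = result.ValueOrDie();  // Safe only after checking ok().
    (void)value;
  } else {
    // result.status() carries the error for logging or propagation.
  }
}

}  // namespace xla
```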
-// Macros for testing the results of functions that return tensorflow::Status or +// Macros for testing the results of functions that return Status or // StatusOr (for any type T). -#define EXPECT_IS_OK(expression) \ - EXPECT_EQ(tensorflow::Status::OK(), \ - xla::testing::internal_status::GetStatus(expression)) -#define EXPECT_IS_NOT_OK(expression) \ - EXPECT_NE(tensorflow::Status::OK(), \ - xla::testing::internal_status::GetStatus(expression)) +#define EXPECT_IS_OK(expression) \ + EXPECT_EQ(Status::OK(), xla::testing::internal_status::GetStatus(expression)) +#define EXPECT_IS_NOT_OK(expression) \ + EXPECT_NE(Status::OK(), xla::testing::internal_status::GetStatus(expression)) #undef ASSERT_IS_OK -#define ASSERT_IS_OK(expression) \ - ASSERT_EQ(tensorflow::Status::OK(), \ - xla::testing::internal_status::GetStatus(expression)) +#define ASSERT_IS_OK(expression) \ + ASSERT_EQ(Status::OK(), xla::testing::internal_status::GetStatus(expression)) #undef ASSERT_IS_NOT_OK -#define ASSERT_IS_NOT_OK(expression) \ - ASSERT_NE(tensorflow::Status::OK(), \ - xla::testing::internal_status::GetStatus(expression)) +#define ASSERT_IS_NOT_OK(expression) \ + ASSERT_NE(Status::OK(), xla::testing::internal_status::GetStatus(expression)) #endif // TENSORFLOW_COMPILER_XLA_TEST_HELPERS_H_ diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD index c28d14ba8ac3a0..7f6bbe6f879fd9 100644 --- a/tensorflow/compiler/xla/tests/BUILD +++ b/tensorflow/compiler/xla/tests/BUILD @@ -87,12 +87,12 @@ cc_library( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:array4d", + "//tensorflow/compiler/xla:error_spec", + "//tensorflow/compiler/xla:literal_comparison", "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/core:lib", "//tensorflow/core:test", @@ -117,11 +117,11 @@ cc_library( "//tensorflow/compiler/xla/service:backend", "//tensorflow/compiler/xla/service:computation_layout", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_runner", "//tensorflow/compiler/xla/service:hlo_verifier", "//tensorflow/compiler/xla/service:interpreter_plugin", # reference backend "//tensorflow/compiler/xla/service:platform_util", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:stream_executor_no_cuda", "//tensorflow/core:test", @@ -138,8 +138,8 @@ cc_library( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/service:hlo_verifier", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", ], @@ -152,7 +152,6 @@ tf_cc_binary( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/service/cpu:cpu_compiler", @@ -188,8 +187,6 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", 
"//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -288,8 +285,6 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -313,7 +308,6 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -335,7 +329,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -378,7 +371,6 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -398,7 +390,6 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -422,8 +413,6 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", @@ -450,8 +439,6 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -472,7 +459,6 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -491,7 +477,6 @@ xla_test( ], deps = [ "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", 
"//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -528,7 +513,6 @@ xla_test( tags = ["enable_for_xla_interpreter"], deps = [ "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", @@ -552,7 +536,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -572,8 +555,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -598,8 +579,6 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -626,12 +605,12 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", @@ -640,6 +619,7 @@ xla_test( xla_test( name = "exhaustive_f32_elementwise_op_test", + size = "enormous", srcs = ["exhaustive_f32_elementwise_op_test.cc"], backends = [ "cpu", @@ -647,13 +627,13 @@ xla_test( ], shard_count = 48, tags = [ - "enormous", "manual", "notap", ], deps = [ ":client_library_test_base", ":literal_test_util", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/core:lib", ], @@ -695,7 +675,6 @@ xla_test( "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -718,8 +697,8 @@ xla_test( "//tensorflow/compiler/xla:execution_options_util", "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:test", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", ], ) @@ -739,7 +718,6 @@ xla_test( "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:reference_util", 
"//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -764,7 +742,6 @@ xla_test( "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -788,7 +765,6 @@ xla_test( "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -800,30 +776,42 @@ xla_test( ], ) +CONVOLUTION_TEST_DEPS = [ + "//tensorflow/compiler/xla:array2d", + "//tensorflow/compiler/xla:array4d", + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:reference_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:statusor", + "//tensorflow/compiler/xla:util", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:global_data", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client:padding", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:test", +] + xla_test( name = "convolution_test", timeout = "long", srcs = ["convolution_test.cc"], shard_count = 25, - deps = [ - "//tensorflow/compiler/xla:array2d", - "//tensorflow/compiler/xla:array4d", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:reference_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:global_data", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client:padding", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/tests:client_library_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], + deps = CONVOLUTION_TEST_DEPS, +) + +xla_test( + name = "convolution_test_gpu_alternative_layout", + timeout = "long", + srcs = ["convolution_test.cc"], + backend_args = {"gpu": ["--xla_backend_extra_options=xla_gpu_experimental_conv_disable_layout_heuristic"]}, + backends = ["gpu"], + shard_count = 25, + deps = CONVOLUTION_TEST_DEPS, ) xla_test( @@ -841,7 +829,6 @@ xla_test( "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:reference_util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -866,7 +853,6 @@ xla_test( 
"//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client:padding", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -928,8 +914,6 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", @@ -958,8 +942,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:test_helpers", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", @@ -1000,7 +982,6 @@ xla_test( deps = [ "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1053,8 +1034,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1076,7 +1055,6 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array3d", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -1106,8 +1084,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", @@ -1201,6 +1177,7 @@ xla_test( "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/service:hlo", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:literal_test_util", @@ -1218,9 +1195,9 @@ xla_test( ], deps = [ ":client_library_test_base", + "//tensorflow/compiler/xla/service:hlo_parser", "//tensorflow/compiler/xla/tests:hlo_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/compiler/xla/tools/parser:hlo_parser", "//tensorflow/core:lib", "//tensorflow/core:test", ], @@ -1237,8 +1214,6 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:test_helpers", 
"//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:client_library_test_base", @@ -1278,7 +1253,6 @@ xla_test( "//tensorflow/compiler/xla:array2d", "//tensorflow/compiler/xla:array4d", "//tensorflow/compiler/xla:reference_util", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1301,7 +1275,6 @@ xla_test( "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1341,7 +1314,6 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1359,7 +1331,6 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1385,8 +1356,6 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1408,7 +1377,6 @@ xla_test( "//tensorflow/compiler/xla:test", "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1480,8 +1448,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:arithmetic", @@ -1529,7 +1495,6 @@ xla_test( "//tensorflow/compiler/xla:shape_util", "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1542,6 +1507,30 @@ xla_test( ], ) +xla_test( + name = "cross_replica_sum_test", + srcs = ["cross_replica_sum_test.cc"], + deps = [ + "//tensorflow/compiler/xla:literal_util", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla:test", + 
"//tensorflow/compiler/xla:test_helpers", + "//tensorflow/compiler/xla:types", + "//tensorflow/compiler/xla:xla_data_proto", + "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/client/xla_client:xla_builder", + "//tensorflow/compiler/xla/client/xla_client:xla_computation", + "//tensorflow/compiler/xla/service:hlo_parser", + "//tensorflow/compiler/xla/tests:client_library_test_base", + "//tensorflow/compiler/xla/tests:hlo_test_base", + "//tensorflow/compiler/xla/tests:literal_test_util", + "//tensorflow/compiler/xla/tests:xla_internal_test_main", + "//tensorflow/core:lib", + "//tensorflow/core:stream_executor_no_cuda", + "//tensorflow/core:test", + ], +) + xla_test( name = "bitcast_convert_test", srcs = ["bitcast_convert_test.cc"], @@ -1571,8 +1560,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla:xla_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -1593,7 +1580,6 @@ xla_test( "enable_for_xla_interpreter", ], deps = [ - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1617,8 +1603,6 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1639,7 +1623,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -1658,7 +1641,6 @@ xla_test( srcs = ["execution_profile_test.cc"], deps = [ ":client_library_test_base", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1673,7 +1655,6 @@ xla_test( args = ["--xla_hlo_profile"], deps = [ ":client_library_test_base", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1779,8 +1760,6 @@ xla_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1808,8 +1787,6 @@ xla_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto", 
"//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1847,8 +1824,6 @@ xla_test( deps = [ "//tensorflow/compiler/xla:literal_util", "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1864,7 +1839,10 @@ xla_test( xla_test( name = "local_client_execute_test", + # TODO(b/79375911): Test times out in LLVM at normal size. + size = "large", srcs = ["local_client_execute_test.cc"], + shard_count = 30, tags = ["optonly"], deps = [ "//tensorflow/compiler/xla:literal_util", @@ -1874,8 +1852,6 @@ xla_test( "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", @@ -1930,24 +1906,6 @@ xla_test( ], ) -xla_test( - name = "set_return_value_test", - srcs = ["set_return_value_test.cc"], - deps = [ - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/client:computation_builder", - "//tensorflow/compiler/xla/client:local_client", - "//tensorflow/compiler/xla/client/xla_client:xla_builder", - "//tensorflow/compiler/xla/client/xla_client:xla_computation", - "//tensorflow/compiler/xla/tests:client_library_test_base", - "//tensorflow/compiler/xla/tests:hlo_test_base", - "//tensorflow/compiler/xla/tests:literal_test_util", - "//tensorflow/compiler/xla/tests:xla_internal_test_main", - "//tensorflow/core:lib", - "//tensorflow/core:test", - ], -) - xla_test( name = "reshape_motion_test", srcs = ["reshape_motion_test.cc"], @@ -1961,8 +1919,6 @@ xla_test( "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:test_helpers", "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/client:computation", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/xla_client:xla_builder", @@ -1979,6 +1935,7 @@ xla_test( name = "deep_graph_test", srcs = ["deep_graph_test.cc"], deps = [ + "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/tests:client_library_test_base", "//tensorflow/compiler/xla/tests:xla_internal_test_main", ], @@ -2062,7 +2019,6 @@ xla_test( ":local_client_test_base", ":test_utils", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla/client:computation_builder", "//tensorflow/compiler/xla/client/xla_client:xla_builder", "//tensorflow/compiler/xla/client/xla_client:xla_computation", "//tensorflow/compiler/xla/tests:xla_internal_test_main", diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc index e8a5efe796a920..36a706496918ac 100644 --- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc +++ 
b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc @@ -2225,6 +2225,15 @@ XLA_TEST_F(ArrayElementwiseOpTest, ClzU32s) { ComputeAndCompareR1(&builder, {32, 31, 27, 15, 9, 3, 0}, {}); } +XLA_TEST_F(ArrayElementwiseOpTest, ClzS64s) { + XlaBuilder builder(TestName()); + auto a = + builder.ConstantR1({0, 1, 0x80000000, 0x7FFFFFFFF2345678ul, -1}); + builder.Clz(a); + + ComputeAndCompareR1(&builder, {64, 63, 32, 1, 0}, {}); +} + XLA_TEST_F(ArrayElementwiseOpTest, AddChainFoldLeft) { // a ------ (add) --------- (add) // / / diff --git a/tensorflow/compiler/xla/tests/bfloat16_test.cc b/tensorflow/compiler/xla/tests/bfloat16_test.cc index 4e65cf11f3f1a0..ca337e78840e77 100644 --- a/tensorflow/compiler/xla/tests/bfloat16_test.cc +++ b/tensorflow/compiler/xla/tests/bfloat16_test.cc @@ -37,7 +37,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/compiler/xla/tests/test_utils.h" #include "tensorflow/compiler/xla/util.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/tests/broadcast_test.cc b/tensorflow/compiler/xla/tests/broadcast_test.cc index 6ebbf7191833ef..51b9f0d3e330e7 100644 --- a/tensorflow/compiler/xla/tests/broadcast_test.cc +++ b/tensorflow/compiler/xla/tests/broadcast_test.cc @@ -46,8 +46,8 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarToScalar) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - LiteralTestUtil::ExpectNear(*Literal::CreateR0(42.0), *result, - error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near(*Literal::CreateR0(42.0), *result, + error_spec_)); } XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) { @@ -62,9 +62,9 @@ XLA_TEST_F(BroadcastTest, BroadcastScalarTo2D) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - LiteralTestUtil::ExpectNear( + EXPECT_TRUE(LiteralTestUtil::Near( *Literal::CreateR2({{42.0, 42.0}, {42.0, 42.0}}), *result, - error_spec_); + error_spec_)); } XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) { @@ -85,13 +85,13 @@ XLA_TEST_F(BroadcastTest, BroadcastVectorTo2D) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - LiteralTestUtil::ExpectNear( + EXPECT_TRUE(LiteralTestUtil::Near( *Literal::CreateR2({{1.0, 1.0}, {2.0, 2.0}, {3.0, 3.0}}), - LiteralView::Create(*result, {0}), error_spec_); + LiteralSlice(*result, {0}), error_spec_)); - LiteralTestUtil::ExpectNear( + EXPECT_TRUE(LiteralTestUtil::Near( *Literal::CreateR2({{1.0, 2.0, 3.0}, {1.0, 2.0, 3.0}}), - LiteralView::Create(*result, {1}), error_spec_); + LiteralSlice(*result, {1}), error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) { @@ -106,9 +106,9 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2D) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - LiteralTestUtil::ExpectNear( - *Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}), *result, - error_spec_); + EXPECT_TRUE( + LiteralTestUtil::Near(*Literal::CreateR2({{1.0, 2.0}, {3.0, 4.0}}), + *result, error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) { @@ -125,9 +125,9 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo2DTranspose) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - 
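// Note (illustrative reconstruction, not part of the original patch): in this copy of the patch the
// angle-bracketed template arguments on the builder helpers appear to have been stripped. Assuming
// the elided type argument is int64, the new ClzS64s test above likely reads:
XLA_TEST_F(ArrayElementwiseOpTest, ClzS64s) {
  XlaBuilder builder(TestName());
  auto a = builder.ConstantR1<int64>(
      {0, 1, 0x80000000, 0x7FFFFFFFF2345678ul, -1});
  builder.Clz(a);  // count-leading-zeros, elementwise

  ComputeAndCompareR1<int64>(&builder, {64, 63, 32, 1, 0}, {});
}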
LiteralTestUtil::ExpectNear( - *Literal::CreateR2({{1.0, 3.0}, {2.0, 4.0}}), *result, - error_spec_); + EXPECT_TRUE( + LiteralTestUtil::Near(*Literal::CreateR2({{1.0, 3.0}, {2.0, 4.0}}), + *result, error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) { @@ -142,10 +142,10 @@ XLA_TEST_F(BroadcastTest, Broadcast2DTo3D) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - LiteralTestUtil::ExpectNear( + EXPECT_TRUE(LiteralTestUtil::Near( *Literal::CreateR3({{{1.0, 2.0}, {1.0, 2.0}, {1.0, 2.0}}, {{3.0, 4.0}, {3.0, 4.0}, {3.0, 4.0}}}), - *result, error_spec_); + *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) { @@ -166,8 +166,8 @@ TEST_F(BroadcastTest, Broadcast_R1_2_To_R4_2x2x3x3) { Array2D pz({{1, 2}, {1, 2}}); expected.FillWithPZ(pz); - LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(expected), - *result, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) { @@ -196,8 +196,8 @@ TEST_F(BroadcastTest, Broadcast_R1_1025_To_R4_3x3x3x1025) { } expected.FillWithYX(yx); - LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(expected), - *result, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); } XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) { @@ -218,8 +218,8 @@ XLA_TEST_F(BroadcastTest, Broadcast_R1_64_To_R4_32x64x7x7) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(r4_array), *result, - error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near(*Literal::CreateR4FromArray4D(r4_array), + *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) { @@ -238,8 +238,8 @@ TEST_F(BroadcastTest, Broadcast_R0_to_R4_64x64x3x3) { Array4D expected(64, 64, 3, 3); expected.Fill(1.0f); - LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(expected), - *result, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) { @@ -260,8 +260,8 @@ TEST_F(BroadcastTest, Broadcast_R2_2x2_To_R4_3x3x2x2) { Array4D expected(3, 3, 2, 2); expected.FillWithYX(to_broadcast); - LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(expected), - *result, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); } TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) { @@ -291,8 +291,8 @@ TEST_F(BroadcastTest, Broadcast_R3_2x3x4_to_R4_2x3x4x5) { hlo_module->AddEntryComputation(builder.Build()); auto result = ExecuteAndTransfer(std::move(hlo_module), {}); - LiteralTestUtil::ExpectNear(*Literal::CreateR4FromArray4D(expected), - *result, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR4FromArray4D(expected), *result, error_spec_)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/call_test.cc b/tensorflow/compiler/xla/tests/call_test.cc index 5e42365ae38dcc..5fd33b50c94356 100644 --- a/tensorflow/compiler/xla/tests/call_test.cc +++ b/tensorflow/compiler/xla/tests/call_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -32,16 +32,16 @@ namespace { class CallOpTest : public ClientLibraryTestBase { protected: - Computation CreateR0F32IdentityComputation() { - ComputationBuilder builder(client_, "Identity"); + XlaComputation CreateR0F32IdentityComputation() { + XlaBuilder builder("Identity"); builder.Parameter(0, r0f32_, "x"); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); return build_status.ConsumeValueOrDie(); } - Computation CreateR1S0F32AdditionComputation() { - ComputationBuilder builder(client_, "Addition"); + XlaComputation CreateR1S0F32AdditionComputation() { + XlaBuilder builder("Addition"); auto x = builder.Parameter(0, r1s0f32_, "x"); auto y = builder.Parameter(1, r1s0f32_, "y"); builder.Add(x, y); @@ -50,8 +50,8 @@ class CallOpTest : public ClientLibraryTestBase { return build_status.ConsumeValueOrDie(); } - Computation CreateR1S2F32AdditionComputation() { - ComputationBuilder builder(client_, "Addition"); + XlaComputation CreateR1S2F32AdditionComputation() { + XlaBuilder builder("Addition"); auto x = builder.Parameter(0, r1s2f32_, "x"); auto y = builder.Parameter(1, r1s2f32_, "y"); builder.Add(x, y); @@ -60,8 +60,8 @@ class CallOpTest : public ClientLibraryTestBase { return build_status.ConsumeValueOrDie(); } - Computation CreateR0F32TupleComputation() { - ComputationBuilder builder(client_, "Tuple"); + XlaComputation CreateR0F32TupleComputation() { + XlaBuilder builder("Tuple"); builder.Tuple({builder.Parameter(0, r0f32_, "x")}); auto build_status = builder.Build(); EXPECT_IS_OK(build_status.status()); @@ -74,8 +74,8 @@ class CallOpTest : public ClientLibraryTestBase { }; XLA_TEST_F(CallOpTest, CallR0F32IdentityScalar) { - ComputationBuilder builder(client_, TestName()); - Computation callee = CreateR0F32IdentityComputation(); + XlaBuilder builder(TestName()); + XlaComputation callee = CreateR0F32IdentityComputation(); auto constant = builder.ConstantLiteral(*Literal::CreateR0(42.0)); builder.Call(callee, {constant}); @@ -83,8 +83,8 @@ XLA_TEST_F(CallOpTest, CallR0F32IdentityScalar) { } XLA_TEST_F(CallOpTest, CallR1S0F32AddArray) { - ComputationBuilder builder(client_, TestName()); - Computation callee = CreateR1S0F32AdditionComputation(); + XlaBuilder builder(TestName()); + XlaComputation callee = CreateR1S0F32AdditionComputation(); auto x = builder.ConstantLiteral(*Literal::CreateR1({})); auto y = builder.ConstantLiteral(*Literal::CreateR1({})); builder.Call(callee, {x, y}); @@ -93,8 +93,8 @@ XLA_TEST_F(CallOpTest, CallR1S0F32AddArray) { } XLA_TEST_F(CallOpTest, CallR1S2F32AddArray) { - ComputationBuilder builder(client_, TestName()); - Computation callee = CreateR1S2F32AdditionComputation(); + XlaBuilder builder(TestName()); + XlaComputation callee = CreateR1S2F32AdditionComputation(); auto x = builder.ConstantLiteral(*Literal::CreateR1({1.0f, 2.0f})); auto y = builder.ConstantLiteral(*Literal::CreateR1({2.0f, 3.0f})); builder.Call(callee, {x, y}); @@ -103,23 +103,23 @@ XLA_TEST_F(CallOpTest, CallR1S2F32AddArray) { } XLA_TEST_F(CallOpTest, CallTreeTwoDeepBranchFactorThree) { - ComputationBuilder builder(client_, "inner"); + XlaBuilder 
builder("inner"); { auto x = builder.Parameter(0, r0f32_, "x"); builder.Add(x, builder.ConstantR0(1.0)); } - TF_ASSERT_OK_AND_ASSIGN(Computation inner, builder.Build()); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation inner, builder.Build()); - ComputationBuilder builder2(client_, "outer"); + XlaBuilder builder2("outer"); { auto x = builder2.Parameter(0, r0f32_, "x"); x = builder2.Call(inner, {x}); x = builder2.Call(inner, {x}); x = builder2.Call(inner, {x}); } - TF_ASSERT_OK_AND_ASSIGN(Computation outer, builder2.Build()); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation outer, builder2.Build()); - ComputationBuilder builder3(client_, "outermost"); + XlaBuilder builder3("outermost"); { auto x = builder3.Parameter(0, r0f32_, "x"); x = builder3.Call(outer, {x}); @@ -134,8 +134,8 @@ XLA_TEST_F(CallOpTest, CallTreeTwoDeepBranchFactorThree) { } XLA_TEST_F(CallOpTest, CallR0F32Tuple) { - ComputationBuilder builder(client_, TestName()); - Computation callee = CreateR0F32TupleComputation(); + XlaBuilder builder(TestName()); + XlaComputation callee = CreateR0F32TupleComputation(); auto elem = Literal::CreateR0(42.0); auto tuple = Literal::MakeTuple({elem.get()}); builder.Call(callee, {builder.ConstantLiteral(*elem)}); diff --git a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc index f594cc10ac6496..660ff0cad56662 100644 --- a/tensorflow/compiler/xla/tests/check_execution_arity_test.cc +++ b/tensorflow/compiler/xla/tests/check_execution_arity_test.cc @@ -15,9 +15,9 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -35,7 +35,7 @@ using ::testing::ContainsRegex; class CheckExecutionArityTest : public ClientLibraryTestBase {}; TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) { - ComputationBuilder builder(client_, "add_two_params"); + XlaBuilder builder("add_two_params"); auto param_literal = Literal::CreateR1({1.1f, 2.2f}); auto p0 = builder.Parameter(0, param_literal->shape(), "param0"); @@ -75,7 +75,7 @@ TEST_F(CheckExecutionArityTest, TwoParamComputationNumArguments) { } XLA_TEST_F(CheckExecutionArityTest, CheckArgumentShapes) { - ComputationBuilder builder(client_, "add_two_params"); + XlaBuilder builder("add_two_params"); auto p0 = builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param0"); auto p1 = builder.Parameter(1, ShapeUtil::MakeShape(F32, {4}), "param1"); diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc index 22660c35dcaa0e..bf8ed4d9fb0bc6 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.cc +++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc @@ -18,7 +18,6 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/literal_util.h" @@ -94,27 +93,13 @@ string ClientLibraryTestBase::TestName() const { return ::testing::UnitTest::GetInstance()->current_test_info()->name(); } -template StatusOr> ClientLibraryTestBase::Execute( - BuilderT* builder, tensorflow::gtl::ArraySlice arguments) { + XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { // Build the computation, as a convenience. TF_ASSIGN_OR_RETURN(auto computation, builder->Build()); return client_->Execute(computation, arguments, &execution_options_); } -StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( - const Computation& computation, - tensorflow::gtl::ArraySlice arguments, - const Shape* shape_with_output_layout) { - ExecutionOptions execution_options = execution_options_; - if (shape_with_output_layout != nullptr) { - *execution_options.mutable_shape_with_output_layout() = - *shape_with_output_layout; - } - return client_->ExecuteAndTransfer(computation, arguments, - &execution_options); -} - StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, @@ -128,17 +113,6 @@ StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( &execution_options); } -template <> -StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments, - const Shape* shape_with_output_layout) { - // Build the computation, as a convenience. - TF_ASSIGN_OR_RETURN(auto computation, builder->Build()); - return ExecuteAndTransfer(computation, arguments, shape_with_output_layout); -} - -template <> StatusOr> ClientLibraryTestBase::ExecuteAndTransfer( XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments, const Shape* shape_with_output_layout) { @@ -162,18 +136,6 @@ ClientLibraryTestBase::ExecuteAndTransferReference( &execution_options); } -std::unique_ptr ClientLibraryTestBase::ExecuteOrDie( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments) { - return Execute(builder, arguments).ConsumeValueOrDie(); -} - -std::unique_ptr ClientLibraryTestBase::ExecuteAndTransferOrDie( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments) { - return ExecuteAndTransfer(builder, arguments).ConsumeValueOrDie(); -} - string ClientLibraryTestBase::ExecuteToString( XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { auto computation_status = builder->Build(); @@ -191,32 +153,6 @@ string ClientLibraryTestBase::ExecuteToString( } } -string ClientLibraryTestBase::ExecuteToString( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments) { - auto computation_status = builder->Build(); - if (!computation_status.ok()) { - return computation_status.status().ToString(); - } - auto computation = computation_status.ConsumeValueOrDie(); - - auto result = - client_->ExecuteAndTransfer(computation, arguments, &execution_options_); - if (!result.ok()) { - return result.status().ToString(); - } else { - return result.ValueOrDie()->ToString(); - } -} - -void ClientLibraryTestBase::ComputeAndCompareR1( - ComputationBuilder* builder, const tensorflow::core::Bitmap& expected, - tensorflow::gtl::ArraySlice arguments) { - std::unique_ptr expected_literal = Literal::CreateR1(expected); - ClientLibraryTestBase::ComputeAndCompareLiteral(builder, 
*expected_literal, - arguments); -} - void ClientLibraryTestBase::ComputeAndCompareR1( XlaBuilder* builder, const tensorflow::core::Bitmap& expected, tensorflow::gtl::ArraySlice arguments) { @@ -225,27 +161,24 @@ void ClientLibraryTestBase::ComputeAndCompareR1( arguments); } -template void ClientLibraryTestBase::ComputeAndCompareLiteral( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, const Shape* shape_with_layout) { EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments, shape_with_layout)); } -template void ClientLibraryTestBase::ComputeAndCompareLiteral( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error, const Shape* shape_with_layout) { EXPECT_IS_OK(ComputeAndCompareLiteralWithStatus(builder, expected, arguments, error, shape_with_layout)); } -tensorflow::Status -ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts( - const xla::Computation& computation, const Literal& expected, +Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts( + const xla::XlaComputation& computation, const Literal& expected, tensorflow::gtl::ArraySlice arguments, const std::function& verify_output) { @@ -266,12 +199,11 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts( "Test with output layout: ", ShapeUtil::HumanStringWithLayout(layout))); } while (std::next_permutation(minor_to_major.begin(), minor_to_major.end())); - return tensorflow::Status::OK(); + return Status::OK(); } -tensorflow::Status -ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( - const xla::Computation& computation, const Literal& expected, +Status ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( + const xla::XlaComputation& computation, const Literal& /*expected*/, tensorflow::gtl::ArraySlice arguments, const std::function& verify_output, @@ -281,8 +213,8 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( // This is a recursive function. It's an std::function instead of a lambda // because it needs to capture itself. The index is the index of the argument // to try all layouts for. - std::function choose; - choose = [&, this](int64 index) -> tensorflow::Status { + std::function choose; + choose = [&, this](int64 index) -> Status { if (index < arguments.size()) { // Try out all layouts for the operand. TF_ASSIGN_OR_RETURN(auto literal, @@ -295,7 +227,7 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( TF_RETURN_IF_ERROR(choose(index + 1)); arguments_with_layout.pop_back(); layout_strings.pop_back(); - return tensorflow::Status::OK(); + return Status::OK(); } std::vector minor_to_major(ShapeUtil::Rank(literal->shape())); @@ -313,7 +245,7 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( layout_strings.pop_back(); } while ( std::next_permutation(minor_to_major.begin(), minor_to_major.end())); - return tensorflow::Status::OK(); + return Status::OK(); } // Every argument has an assigned layout. 
@@ -328,34 +260,14 @@ ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( tensorflow::strings::StrAppend(&error_message, str, " "); } verify_output(*actual, error_message); - return tensorflow::Status::OK(); + return Status::OK(); }; return choose(0); } -tensorflow::Status -ClientLibraryTestBase::ComputeAndCompareLiteralWithAllOutputLayouts( - const xla::XlaComputation& /*computation*/, const Literal& /*expected*/, - tensorflow::gtl::ArraySlice /*arguments*/, - const std::function& /*verify_output*/) { - return Unimplemented("not yet implemented for XlaComputation"); -} - -tensorflow::Status -ClientLibraryTestBase::ComputeAndCompareLiteralWithAllInputLayouts( - const xla::XlaComputation& /*computation*/, const Literal& /*expected*/, - tensorflow::gtl::ArraySlice /*arguments*/, - const std::function& /*verify_output*/, - const Shape* /*output_with_layout*/) { - return Unimplemented("not yet implemented for XlaComputation"); -} - -template -tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( - BuilderT* builder, const Literal& expected, +Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments_passed_in, const Shape* shape_with_layout) { std::vector arguments(arguments_passed_in.begin(), @@ -382,7 +294,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( std::unique_ptr converted_expected; Shape layout_shape; if (use_bfloat16_) { - converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected); + converted_expected = Literal::ConvertF32ToBF16(expected); expected_ptr = converted_expected.get(); if (shape_with_layout != nullptr) { layout_shape = *shape_with_layout; @@ -396,7 +308,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( } } auto expect_equal = [&](const Literal& actual, const string& error_message) { - LiteralTestUtil::ExpectEqual(*expected_ptr, actual, error_message); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_ptr, actual)) << error_message; }; if (execution_options_.debug_options().xla_test_all_output_layouts()) { return ComputeAndCompareLiteralWithAllOutputLayouts( @@ -408,13 +320,12 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( } TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments, shape_with_layout)); - LiteralTestUtil::ExpectEqual(*expected_ptr, *actual); - return tensorflow::Status::OK(); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_ptr, *actual)); + return Status::OK(); } -template -tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( - BuilderT* builder, const Literal& expected, +Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments_passed_in, ErrorSpec error, const Shape* shape_with_layout) { std::vector arguments(arguments_passed_in.begin(), @@ -435,7 +346,7 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( std::unique_ptr converted_expected; Shape layout_shape; if (use_bfloat16_) { - converted_expected = LiteralTestUtil::ConvertF32ToBF16(expected); + converted_expected = Literal::ConvertF32ToBF16(expected); expected_ptr = converted_expected.get(); if (shape_with_layout != nullptr) { layout_shape = *shape_with_layout; @@ -449,7 +360,8 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( } } auto expect_near = [&](const Literal& 
actual, const string& error_message) { - LiteralTestUtil::ExpectNear(*expected_ptr, actual, error, error_message); + EXPECT_TRUE(LiteralTestUtil::Near(*expected_ptr, actual, error)) + << error_message; }; if (execution_options_.debug_options().xla_test_all_output_layouts()) { return ComputeAndCompareLiteralWithAllOutputLayouts( @@ -461,8 +373,8 @@ tensorflow::Status ClientLibraryTestBase::ComputeAndCompareLiteralWithStatus( } TF_ASSIGN_OR_RETURN(auto actual, ExecuteAndTransfer(computation, arguments, shape_with_layout)); - LiteralTestUtil::ExpectNear(*expected_ptr, *actual, error); - return tensorflow::Status::OK(); + EXPECT_TRUE(LiteralTestUtil::Near(*expected_ptr, *actual, error)); + return Status::OK(); } void ClientLibraryTestBase::ComputeAndCompareR1U8( @@ -484,9 +396,8 @@ void ClientLibraryTestBase::ComputeAndCompareR1U8( EXPECT_EQ(expected, actual->GetR1U8AsString()); } -template void ClientLibraryTestBase::ComputeAndCompareTuple( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments) { auto actual_status = ExecuteAndTransfer(builder, arguments); EXPECT_IS_OK(actual_status.status()); @@ -494,12 +405,11 @@ void ClientLibraryTestBase::ComputeAndCompareTuple( return; } auto actual = actual_status.ConsumeValueOrDie(); - LiteralTestUtil::ExpectEqual(expected, *actual); + EXPECT_TRUE(LiteralTestUtil::Equal(expected, *actual)); } -template void ClientLibraryTestBase::ComputeAndCompareTuple( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { auto actual_status = ExecuteAndTransfer(builder, arguments); EXPECT_IS_OK(actual_status.status()); @@ -507,61 +417,7 @@ void ClientLibraryTestBase::ComputeAndCompareTuple( return; } auto actual = actual_status.ConsumeValueOrDie(); - LiteralTestUtil::ExpectNear(expected, *actual, error); -} - -void ClientLibraryTestBase::ComputeAndCompare( - ComputationBuilder* builder, const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice arguments) { - auto status_or_data = ComputeValueAndReference(builder, operand, arguments); - EXPECT_IS_OK(status_or_data); - if (!status_or_data.ok()) { - return; - } - std::unique_ptr reference, result; - std::tie(reference, result) = status_or_data.ConsumeValueOrDie(); - LiteralTestUtil::ExpectEqual(*reference, *result); -} - -void ClientLibraryTestBase::ComputeAndCompare( - ComputationBuilder* builder, const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { - auto status_or_data = ComputeValueAndReference(builder, operand, arguments); - EXPECT_IS_OK(status_or_data); - if (!status_or_data.ok()) { - return; - } - std::unique_ptr reference, result; - std::tie(reference, result) = status_or_data.ConsumeValueOrDie(); - LiteralTestUtil::ExpectNear(*reference, *result, error); -} - -StatusOr, std::unique_ptr>> -ClientLibraryTestBase::ComputeValueAndReference( - ComputationBuilder* builder, const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice arguments) { - // Transfer the arguments to the executor service. We put the unique_ptr's - // into a vector to keep the data alive on the service until the end of this - // function. - std::vector> argument_data; - for (const auto& arg : arguments) { - TF_ASSIGN_OR_RETURN(auto data, client_->TransferToServer(arg)); - argument_data.push_back(std::move(data)); - } - - // Create raw pointers to the GlobalData for the rest of the call stack. 
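// Note (illustrative, not part of the original patch): throughout broadcast_test.cc and
// client_library_test_base.cc the patch replaces LiteralTestUtil::ExpectEqual/ExpectNear with gtest
// assertions over LiteralTestUtil::Equal/Near, so the comparison helpers act as predicates and the
// assertion happens at the call site. A minimal sketch of that call-site pattern:
void CheckLiteralsNear(const Literal& expected, const Literal& actual,
                       ErrorSpec error, const string& error_message) {
  // Near is used as a predicate; the diagnostic message is attached by the caller.
  EXPECT_TRUE(LiteralTestUtil::Near(expected, actual, error)) << error_message;
}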
- std::vector argument_data_ptr; - std::transform( - argument_data.begin(), argument_data.end(), - std::back_inserter(argument_data_ptr), - [](const std::unique_ptr& data) { return data.get(); }); - - TF_ASSIGN_OR_RETURN( - auto reference, - builder->ComputeConstant(operand, /*output_layout=*/nullptr, arguments)); - TF_ASSIGN_OR_RETURN(auto result, - ExecuteAndTransfer(builder, argument_data_ptr)); - return std::make_pair(std::move(reference), std::move(result)); + EXPECT_TRUE(LiteralTestUtil::Near(expected, *actual, error)); } void ClientLibraryTestBase::ComputeAndCompare( @@ -573,7 +429,7 @@ void ClientLibraryTestBase::ComputeAndCompare( } std::unique_ptr reference, result; std::tie(reference, result) = status_or_data.ConsumeValueOrDie(); - LiteralTestUtil::ExpectEqual(*reference, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*reference, *result)); } void ClientLibraryTestBase::ComputeAndCompare( @@ -586,7 +442,7 @@ void ClientLibraryTestBase::ComputeAndCompare( } std::unique_ptr reference, result; std::tie(reference, result) = status_or_data.ConsumeValueOrDie(); - LiteralTestUtil::ExpectNear(*reference, *result, error); + EXPECT_TRUE(LiteralTestUtil::Near(*reference, *result, error)); } StatusOr, std::unique_ptr>> @@ -651,8 +507,8 @@ XlaComputation ClientLibraryTestBase::CreateScalarMax() { return computation_status.ConsumeValueOrDie(); } -Computation ClientLibraryTestBase::CreateScalarReluSensitivity() { - ComputationBuilder builder(client_, "relu_sensitivity"); +XlaComputation ClientLibraryTestBase::CreateScalarReluSensitivity() { + XlaBuilder builder("relu_sensitivity"); auto shape = ShapeUtil::MakeShape(use_bfloat16_ ? BF16 : F32, {}); auto activation = builder.Parameter(0, shape, "activation"); auto backprop = builder.Parameter(1, shape, "backprop"); @@ -693,14 +549,6 @@ ClientLibraryTestBase::CreatePatternedMatrixWithZeroPadding(int rows, int cols, return array; } -ComputationDataHandle ClientLibraryTestBase::AddParam( - const Literal& argument, ComputationBuilder* builder) { - ComputationDataHandle data_handle; - arguments_.push_back(CreateParameterAndTransferLiteral( - arguments_.size(), argument, "", builder, &data_handle)); - return data_handle; -} - XlaOp ClientLibraryTestBase::AddParam(const Literal& argument, XlaBuilder* builder) { XlaOp data_handle; @@ -709,59 +557,39 @@ XlaOp ClientLibraryTestBase::AddParam(const Literal& argument, return data_handle; } -ComputationDataHandle ClientLibraryTestBase::CreateConstantFromLiteral( - const Literal& literal, ComputationBuilder* builder) { - return builder->ConstantLiteral( - use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal); -} - XlaOp ClientLibraryTestBase::CreateConstantFromLiteral(const Literal& literal, XlaBuilder* builder) { return builder->ConstantLiteral( - use_bfloat16_ ? *LiteralTestUtil::ConvertF32ToBF16(literal) : literal); + use_bfloat16_ ? 
*Literal::ConvertF32ToBF16(literal) : literal); +} + +std::unique_ptr +ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number, + const Literal& literal, + const string& name, + XlaBuilder* builder, + XlaOp* data_handle) { + return CreateParameterAndTransferLiteral(parameter_number, literal, name, + nullptr, builder, data_handle); +} + +std::unique_ptr +ClientLibraryTestBase::CreateParameterAndTransferLiteral( + int64 parameter_number, const Literal& literal, const string& name, + const DeviceHandle* device_handle, XlaBuilder* builder, + XlaOp* data_handle) { + const Literal* param_literal = &literal; + std::unique_ptr converted_literal; + if (use_bfloat16_) { + converted_literal = Literal::ConvertF32ToBF16(literal); + param_literal = converted_literal.get(); + } + std::unique_ptr data = + client_->TransferToServer(*param_literal, device_handle) + .ConsumeValueOrDie(); + *data_handle = + builder->Parameter(parameter_number, param_literal->shape(), name); + return data; } -template void ClientLibraryTestBase::ComputeAndCompareLiteral( - ComputationBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, - const Shape* shape_with_layout); - -template void ClientLibraryTestBase::ComputeAndCompareLiteral( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, - const Shape* shape_with_layout); - -template void ClientLibraryTestBase::ComputeAndCompareLiteral( - ComputationBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error, - const Shape* shape_with_layout); - -template void ClientLibraryTestBase::ComputeAndCompareLiteral( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error, - const Shape* shape_with_layout); - -template void ClientLibraryTestBase::ComputeAndCompareTuple( - ComputationBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments); - -template void ClientLibraryTestBase::ComputeAndCompareTuple( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments); - -template void ClientLibraryTestBase::ComputeAndCompareTuple( - ComputationBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error); - -template void ClientLibraryTestBase::ComputeAndCompareTuple( - XlaBuilder* builder, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, ErrorSpec error); - -template StatusOr> ClientLibraryTestBase::Execute( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments); - -template StatusOr> ClientLibraryTestBase::Execute( - XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments); - } // namespace xla diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h index 32eea7c2f3a65d..0499fec5898a42 100644 --- a/tensorflow/compiler/xla/tests/client_library_test_base.h +++ b/tensorflow/compiler/xla/tests/client_library_test_base.h @@ -25,10 +25,9 @@ limitations under the License. 
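// Note (illustrative, not part of the original patch): the CreateParameterAndTransferLiteral
// overloads added above convert an F32 literal to BF16 when use_bfloat16_ is set, transfer the
// (possibly converted) literal to the service, and declare a matching Parameter. A hedged usage
// sketch from inside a test body (the value and name are made up for the example):
XlaBuilder builder(TestName());
XlaOp param;
std::unique_ptr<GlobalData> data = CreateParameterAndTransferLiteral(
    /*parameter_number=*/0, *Literal::CreateR0<float>(42.0f), "param0", &builder, &param);
builder.Add(param, param);  // the returned GlobalData is later passed as the execution argument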
#include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -91,21 +90,11 @@ class ClientLibraryTestBase : public ::testing::Test { // Convenience methods for building and running a computation with the member // execution options. Modify execution_options_ in your test if you want to // customize the options. - template StatusOr> Execute( - BuilderT* builder, tensorflow::gtl::ArraySlice arguments); + XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments); - // TODO(b/74197823): Remove the template type 'BuilderT' in all methods once - // the migration to XlaBuilder is complete. - - template StatusOr> ExecuteAndTransfer( - BuilderT* builder, tensorflow::gtl::ArraySlice arguments, - const Shape* shape_with_output_layout = nullptr); - - StatusOr> ExecuteAndTransfer( - const Computation& computation, - tensorflow::gtl::ArraySlice arguments, + XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments, const Shape* shape_with_output_layout = nullptr); StatusOr> ExecuteAndTransfer( @@ -121,101 +110,90 @@ class ClientLibraryTestBase : public ::testing::Test { tensorflow::gtl::ArraySlice arguments, const Shape* shape_with_output_layout = nullptr); - // Convenience OrDie variants of above methods. - std::unique_ptr ExecuteOrDie( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments); - std::unique_ptr ExecuteAndTransferOrDie( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments); - // Run a computation and return its value as a string. If an error // occurs, then instead return the error as a string. string ExecuteToString(XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments); - string ExecuteToString(ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments); // Convenience methods for building and running a computation, transferring // the result, and comparing it to the expected value(s). Methods are // templated on the native host type which maps to specific XLA types (See - // ComputationBuilder/XlaBuilder for details). For each rank, two forms are + // XlaBuilder for details). For each rank, two forms are // provided: one for floating point types with an ErrorSpec parameter, and one // for integral types without the ErrorSpec parameter. 
- template - void ComputeAndCompareR0(BuilderT* builder, NativeT expected, + template + void ComputeAndCompareR0(XlaBuilder* builder, NativeT expected, tensorflow::gtl::ArraySlice arguments); - template - void ComputeAndCompareR0(BuilderT* builder, NativeT expected, + template + void ComputeAndCompareR0(XlaBuilder* builder, NativeT expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error); - template - void ComputeAndCompareR1(BuilderT* builder, + template + void ComputeAndCompareR1(XlaBuilder* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments); - template - void ComputeAndCompareR1(BuilderT* builder, + template + void ComputeAndCompareR1(XlaBuilder* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error); // As above, but uses a bitmap to hold the predicate vector to avoid // deficiencies of vector. - void ComputeAndCompareR1(ComputationBuilder* builder, - const tensorflow::core::Bitmap& expected, - tensorflow::gtl::ArraySlice arguments); void ComputeAndCompareR1(XlaBuilder* builder, const tensorflow::core::Bitmap& expected, tensorflow::gtl::ArraySlice arguments); - template - void ComputeAndCompareR2(BuilderT* builder, const Array2D& expected, + template + void ComputeAndCompareR2(XlaBuilder* builder, + const Array2D& expected, tensorflow::gtl::ArraySlice arguments); - template - void ComputeAndCompareR2(BuilderT* builder, const Array2D& expected, + template + void ComputeAndCompareR2(XlaBuilder* builder, + const Array2D& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error); - template - void ComputeAndCompareR3(BuilderT* builder, const Array3D& expected, + template + void ComputeAndCompareR3(XlaBuilder* builder, + const Array3D& expected, tensorflow::gtl::ArraySlice arguments); - template - void ComputeAndCompareR3(BuilderT* builder, const Array3D& expected, + template + void ComputeAndCompareR3(XlaBuilder* builder, + const Array3D& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error); - template - void ComputeAndCompareR4(BuilderT* builder, const Array4D& expected, + template + void ComputeAndCompareR4(XlaBuilder* builder, + const Array4D& expected, tensorflow::gtl::ArraySlice arguments); - template - void ComputeAndCompareR4(BuilderT* builder, const Array4D& expected, + template + void ComputeAndCompareR4(XlaBuilder* builder, + const Array4D& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error); // Build and run the computation and compare the result with the given // literal. shape_with_layout indicates the result layout to request when // calling Execute. - template void ComputeAndCompareLiteral( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, const Shape* shape_with_layout = nullptr); - template void ComputeAndCompareLiteral( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error, const Shape* shape_with_layout = nullptr); // ComputeAndCompare variant which returns an error status. 
- template - tensorflow::Status ComputeAndCompareLiteralWithStatus( - BuilderT* builder, const Literal& expected, + Status ComputeAndCompareLiteralWithStatus( + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, const Shape* shape_with_layout = nullptr); - template - tensorflow::Status ComputeAndCompareLiteralWithStatus( - BuilderT* builder, const Literal& expected, + Status ComputeAndCompareLiteralWithStatus( + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error, const Shape* shape_with_layout = nullptr); @@ -227,25 +205,13 @@ class ClientLibraryTestBase : public ::testing::Test { // Convenience method for running a built computation, transferring the // result, and comparing it to the expected tuple literal. - template void ComputeAndCompareTuple( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments); - template void ComputeAndCompareTuple( - BuilderT* builder, const Literal& expected, + XlaBuilder* builder, const Literal& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error); - // Convenience method for running a built computation and comparing the result - // with the HloEvaluator. - void ComputeAndCompare(ComputationBuilder* builder, - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice arguments); - void ComputeAndCompare(ComputationBuilder* builder, - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice arguments, - ErrorSpec error); - // Convenience method for running a built computation and comparing the result // with the reference result. void ComputeAndCompare(XlaBuilder* builder, @@ -257,7 +223,7 @@ class ClientLibraryTestBase : public ::testing::Test { // Create scalar operations for use in reductions. XlaComputation CreateScalarRelu(); XlaComputation CreateScalarMax(); - Computation CreateScalarReluSensitivity(); + XlaComputation CreateScalarReluSensitivity(); // Special case convenience functions for creating filled arrays. @@ -297,34 +263,25 @@ class ClientLibraryTestBase : public ::testing::Test { // server, then stores into "data_handle" the global handle for that // parameter. When the use_bfloat16 flag is set but the literal has F32 // elements, the literal will be converted to BF16 before being transferred. - template std::unique_ptr CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, - BuilderT* builder, HandleT* data_handle); + XlaBuilder* builder, XlaOp* data_handle); // As above, but the caller can specify the device that the literal is // transferred to. If device_handle is nullptr, the literal will be // transferred to the default device. - template std::unique_ptr CreateParameterAndTransferLiteral( int64 parameter_number, const Literal& literal, const string& name, - const DeviceHandle* device_handle, BuilderT* builder, - HandleT* data_handle); + const DeviceHandle* device_handle, XlaBuilder* builder, + XlaOp* data_handle); // Creates a parameter instruction and sets the value that will be passed to // the computation as specified. This function must be used for all parameters // or none and no parameters must be passed when invoking the computation if // using this mechanism. If using this mechanism, then each parameter must be // set exactly once. The first added parameter gets index 0, then 1 and so on. 
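A hedged illustration of the parameter-adding mechanism described above: every parameter goes through `AddParam`, indices are assigned consecutively starting at 0, and no arguments are passed when the computation is invoked. The literals and expected values are invented for the example.

```c++
// Sketch only: both operands are harness-managed parameters, so the argument
// list at ComputeAndCompare time stays empty.
XlaBuilder builder(TestName());
auto lhs = AddParam(*Literal::CreateR1<float>({1.0f, 2.0f}), &builder);  // index 0
auto rhs = AddParam(*Literal::CreateR1<float>({3.0f, 4.0f}), &builder);  // index 1
builder.Add(lhs, rhs);
ComputeAndCompareR1<float>(&builder, {4.0f, 6.0f}, /*arguments=*/{},
                           ErrorSpec(0.0001));
```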
- ComputationDataHandle AddParam(const Literal& argument, - ComputationBuilder* builder); XlaOp AddParam(const Literal& argument, XlaBuilder* builder); - template - ComputationDataHandle AddParam(const Array& argument, - ComputationBuilder* builder) { - return AddParam(*Literal::CreateFromArray(argument), builder); - } template XlaOp AddParam(const Array& argument, XlaBuilder* builder) { return AddParam(*Literal::CreateFromArray(argument), builder); @@ -333,18 +290,11 @@ class ClientLibraryTestBase : public ::testing::Test { // Creates a constant instruction with the given literal. When the // use_bfloat16 flag is set but the literal has F32 elements, the elements // will be converted to BF16s. - ComputationDataHandle CreateConstantFromLiteral(const Literal& literal, - ComputationBuilder* builder); XlaOp CreateConstantFromLiteral(const Literal& literal, XlaBuilder* builder); // Creates a constant instruction with the given array. When the use_bfloat16 // flag is set but the array has float elements, the elements will be // converted to bfloat16s. - template - ComputationDataHandle CreateConstantFromArray(const Array& array, - ComputationBuilder* builder) { - return CreateConstantFromLiteral(*Literal::CreateFromArray(array), builder); - } template XlaOp CreateConstantFromArray(const Array& array, @@ -353,13 +303,6 @@ class ClientLibraryTestBase : public ::testing::Test { } // Same as CreateConstantFromArray, but for scalars. - template - ComputationDataHandle CreateConstantFromScalar(NativeT value, - ComputationBuilder* builder) { - return CreateConstantFromLiteral(*Literal::CreateR0(value), - builder); - } - template XlaOp CreateConstantFromScalar(NativeT value, XlaBuilder* builder) { return CreateConstantFromLiteral(*Literal::CreateR0(value), @@ -374,12 +317,12 @@ class ClientLibraryTestBase : public ::testing::Test { // // When the use_bfloat16 flag is set but NativeT is float, the data will be // converted to bfloat16. - template + template std::unique_ptr CreateR0Parameter(NativeT value, int64 parameter_number, const string& name, - BuilderT* builder, - HandleT* data_handle); + XlaBuilder* builder, + XlaOp* data_handle); // Creates a parameter instruction that wraps the given values and then stores // into "data_handle" the global handle for that parameter. @@ -389,10 +332,10 @@ class ClientLibraryTestBase : public ::testing::Test { // // When the use_bfloat16 flag is set but NativeT is float, the data will be // converted to bfloat16. - template + template std::unique_ptr CreateR1Parameter( tensorflow::gtl::ArraySlice values, int64 parameter_number, - const string& name, BuilderT* builder, HandleT* data_handle); + const string& name, XlaBuilder* builder, XlaOp* data_handle); // Creates a parameter instruction that wraps the given constant array // "array_2d" and then stores to "data_handle" the global handle for that @@ -403,10 +346,10 @@ class ClientLibraryTestBase : public ::testing::Test { // // When the use_bfloat16 flag is set but NativeT is float, the data will be // converted to bfloat16. 
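The explicit parameter helpers above return the server-side `GlobalData` and hand back the builder-side `XlaOp` through `data_handle`. A hedged usage sketch (values and the scalar type are illustrative), mirroring the pattern used by tests later in this change:

```c++
// Sketch: create a scalar parameter, transfer its value to the server, then
// pass the returned GlobalData handle when running the computation.
XlaBuilder builder(TestName());
XlaOp x;
std::unique_ptr<GlobalData> x_data =
    CreateR0Parameter<float>(3.0f, /*parameter_number=*/0, "x", &builder, &x);
builder.Add(x, builder.ConstantR0<float>(1.5f));
ComputeAndCompareR0<float>(&builder, 4.5f, {x_data.get()}, ErrorSpec(0.0001));
```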
- template + template std::unique_ptr CreateR2Parameter( const Array2D& array_2d, int64 parameter_number, - const string& name, BuilderT* builder, HandleT* data_handle); + const string& name, XlaBuilder* builder, XlaOp* data_handle); // Creates a parameter instruction that wraps the given constant array // "array_3d" and then stores to "data_handle" the global handle for that @@ -417,10 +360,10 @@ class ClientLibraryTestBase : public ::testing::Test { // // When the use_bfloat16 flag is set but NativeT is float, the data will be // converted to bfloat16. - template + template std::unique_ptr CreateR3Parameter( const Array3D& array_3d, int64 parameter_number, - const string& name, BuilderT* builder, HandleT* data_handle); + const string& name, XlaBuilder* builder, XlaOp* data_handle); // Getter and setter for the use_bfloat16 flag, which indicates whether to run // tests with all float-type input/output converted to bfloat16. @@ -435,40 +378,18 @@ class ClientLibraryTestBase : public ::testing::Test { ExecutionOptions execution_options_; private: - // Build and run the computation with all permutations of output layouts. - tensorflow::Status ComputeAndCompareLiteralWithAllOutputLayouts( - const xla::Computation& computation, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, - const std::function& verify_output); - // Build and run the computation with all permutations of layouts of all input - // arguments. - tensorflow::Status ComputeAndCompareLiteralWithAllInputLayouts( - const xla::Computation& computation, const Literal& expected, - tensorflow::gtl::ArraySlice arguments, - const std::function& verify_output, - const Shape* output_with_layout = nullptr); - - tensorflow::Status ComputeAndCompareLiteralWithAllOutputLayouts( + Status ComputeAndCompareLiteralWithAllOutputLayouts( const xla::XlaComputation& computation, const Literal& expected, tensorflow::gtl::ArraySlice arguments, const std::function& verify_output); - tensorflow::Status ComputeAndCompareLiteralWithAllInputLayouts( + Status ComputeAndCompareLiteralWithAllInputLayouts( const xla::XlaComputation& computation, const Literal& expected, tensorflow::gtl::ArraySlice arguments, const std::function& verify_output, const Shape* output_with_layout = nullptr); - // Executes the computation and calculates the expected reference value using - // the HloEvaluator. Returns two literals in the order of (expected, actual). - StatusOr, std::unique_ptr>> - ComputeValueAndReference(ComputationBuilder* builder, - const ComputationDataHandle& operand, - tensorflow::gtl::ArraySlice arguments); - // Executes the computation and calculates the expected reference value using // the reference client. Returns two literals in the order of (expected, // actual). 
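A hedged sketch of the use_bfloat16 flag described above; the `set_use_bfloat16` accessor name and the tolerance are assumptions made for this example, not part of the diff.

```c++
// Sketch: rerun a float test with inputs/outputs converted to bfloat16.
// set_use_bfloat16 is the assumed setter for the flag described above.
set_use_bfloat16(true);
XlaBuilder builder(TestName());
XlaOp p;
auto p_data = CreateR1Parameter<float>({1.0f, 2.0f}, /*parameter_number=*/0,
                                       "p", &builder, &p);
builder.Add(p, p);
// bfloat16 has fewer mantissa bits, so a looser ErrorSpec is appropriate.
ComputeAndCompareR1<float>(&builder, {2.0f, 4.0f}, {p_data.get()},
                           ErrorSpec(0.01));
```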
@@ -484,9 +405,9 @@ class ClientLibraryTestBase : public ::testing::Test { std::vector> arguments_; }; -template +template void ClientLibraryTestBase::ComputeAndCompareR0( - BuilderT* builder, NativeT expected, + XlaBuilder* builder, NativeT expected, tensorflow::gtl::ArraySlice arguments) { std::unique_ptr expected_literal = Literal::CreateR0(expected); @@ -494,9 +415,9 @@ void ClientLibraryTestBase::ComputeAndCompareR0( arguments); } -template +template void ClientLibraryTestBase::ComputeAndCompareR0( - BuilderT* builder, NativeT expected, + XlaBuilder* builder, NativeT expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || @@ -510,9 +431,9 @@ void ClientLibraryTestBase::ComputeAndCompareR0( arguments, error); } -template +template void ClientLibraryTestBase::ComputeAndCompareR1( - BuilderT* builder, tensorflow::gtl::ArraySlice expected, + XlaBuilder* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments) { std::unique_ptr expected_literal = Literal::CreateR1(expected); @@ -520,9 +441,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1( arguments); } -template +template void ClientLibraryTestBase::ComputeAndCompareR1( - BuilderT* builder, tensorflow::gtl::ArraySlice expected, + XlaBuilder* builder, tensorflow::gtl::ArraySlice expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || @@ -536,9 +457,9 @@ void ClientLibraryTestBase::ComputeAndCompareR1( arguments, error); } -template +template void ClientLibraryTestBase::ComputeAndCompareR2( - BuilderT* builder, const Array2D& expected, + XlaBuilder* builder, const Array2D& expected, tensorflow::gtl::ArraySlice arguments) { std::unique_ptr expected_literal = Literal::CreateR2FromArray2D(expected); @@ -546,9 +467,9 @@ void ClientLibraryTestBase::ComputeAndCompareR2( arguments); } -template +template void ClientLibraryTestBase::ComputeAndCompareR2( - BuilderT* builder, const Array2D& expected, + XlaBuilder* builder, const Array2D& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || @@ -562,9 +483,9 @@ void ClientLibraryTestBase::ComputeAndCompareR2( arguments, error); } -template +template void ClientLibraryTestBase::ComputeAndCompareR3( - BuilderT* builder, const Array3D& expected, + XlaBuilder* builder, const Array3D& expected, tensorflow::gtl::ArraySlice arguments) { std::unique_ptr expected_literal = Literal::CreateR3FromArray3D(expected); @@ -572,9 +493,9 @@ void ClientLibraryTestBase::ComputeAndCompareR3( arguments); } -template +template void ClientLibraryTestBase::ComputeAndCompareR3( - BuilderT* builder, const Array3D& expected, + XlaBuilder* builder, const Array3D& expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || @@ -588,9 +509,9 @@ void ClientLibraryTestBase::ComputeAndCompareR3( arguments, error); } -template +template void ClientLibraryTestBase::ComputeAndCompareR4( - BuilderT* builder, const Array4D& expected, + XlaBuilder* builder, const Array4D& expected, tensorflow::gtl::ArraySlice arguments) { std::unique_ptr expected_literal = Literal::CreateR4FromArray4D(expected); @@ -598,9 +519,9 @@ void ClientLibraryTestBase::ComputeAndCompareR4( arguments); } -template +template void ClientLibraryTestBase::ComputeAndCompareR4( - BuilderT* builder, const Array4D& expected, + XlaBuilder* builder, const Array4D& 
expected, tensorflow::gtl::ArraySlice arguments, ErrorSpec error) { static_assert(std::is_same::value || std::is_same::value || @@ -614,13 +535,13 @@ void ClientLibraryTestBase::ComputeAndCompareR4( arguments, error); } -template +template std::unique_ptr ClientLibraryTestBase::CreateR0Parameter( NativeT value, int64 parameter_number, const string& name, - BuilderT* builder, HandleT* data_handle) { + XlaBuilder* builder, XlaOp* data_handle) { std::unique_ptr literal = Literal::CreateR0(value); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = LiteralTestUtil::ConvertF32ToBF16(*literal); + literal = Literal::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); @@ -628,13 +549,13 @@ std::unique_ptr ClientLibraryTestBase::CreateR0Parameter( return data; } -template +template std::unique_ptr ClientLibraryTestBase::CreateR1Parameter( tensorflow::gtl::ArraySlice values, int64 parameter_number, - const string& name, BuilderT* builder, HandleT* data_handle) { + const string& name, XlaBuilder* builder, XlaOp* data_handle) { std::unique_ptr literal = Literal::CreateR1(values); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = LiteralTestUtil::ConvertF32ToBF16(*literal); + literal = Literal::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); @@ -642,13 +563,13 @@ std::unique_ptr ClientLibraryTestBase::CreateR1Parameter( return data; } -template +template std::unique_ptr ClientLibraryTestBase::CreateR2Parameter( const Array2D& array_2d, int64 parameter_number, - const string& name, BuilderT* builder, HandleT* data_handle) { + const string& name, XlaBuilder* builder, XlaOp* data_handle) { std::unique_ptr literal = Literal::CreateR2FromArray2D(array_2d); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = LiteralTestUtil::ConvertF32ToBF16(*literal); + literal = Literal::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); @@ -656,13 +577,13 @@ std::unique_ptr ClientLibraryTestBase::CreateR2Parameter( return data; } -template +template std::unique_ptr ClientLibraryTestBase::CreateR3Parameter( const Array3D& array_3d, int64 parameter_number, - const string& name, BuilderT* builder, HandleT* data_handle) { + const string& name, XlaBuilder* builder, XlaOp* data_handle) { std::unique_ptr literal = Literal::CreateR3FromArray3D(array_3d); if (use_bfloat16_ && literal->shape().element_type() == F32) { - literal = LiteralTestUtil::ConvertF32ToBF16(*literal); + literal = Literal::ConvertF32ToBF16(*literal); } std::unique_ptr data = client_->TransferToServer(*literal).ConsumeValueOrDie(); @@ -695,37 +616,6 @@ std::unique_ptr> ClientLibraryTestBase::CreatePseudorandomR2( return result; } -template -std::unique_ptr -ClientLibraryTestBase::CreateParameterAndTransferLiteral(int64 parameter_number, - const Literal& literal, - const string& name, - BuilderT* builder, - HandleT* data_handle) { - return CreateParameterAndTransferLiteral(parameter_number, literal, name, - nullptr, builder, data_handle); -} - -template -std::unique_ptr -ClientLibraryTestBase::CreateParameterAndTransferLiteral( - int64 parameter_number, const Literal& literal, const string& name, - const DeviceHandle* device_handle, BuilderT* builder, - HandleT* data_handle) { - const Literal* param_literal = &literal; - std::unique_ptr converted_literal; - if (use_bfloat16_) { - converted_literal = 
LiteralTestUtil::ConvertF32ToBF16(literal); - param_literal = converted_literal.get(); - } - std::unique_ptr data = - client_->TransferToServer(*param_literal, device_handle) - .ConsumeValueOrDie(); - *data_handle = - builder->Parameter(parameter_number, param_literal->shape(), name); - return data; -} - } // namespace xla #endif // TENSORFLOW_COMPILER_XLA_TESTS_CLIENT_LIBRARY_TEST_BASE_H_ diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc index 1e544717967731..08671cf6244582 100644 --- a/tensorflow/compiler/xla/tests/client_test.cc +++ b/tensorflow/compiler/xla/tests/client_test.cc @@ -16,7 +16,6 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" @@ -39,7 +38,7 @@ namespace { class ClientTest : public ClientLibraryTestBase {}; XLA_TEST_F(ClientTest, ExecuteWithLayout) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); std::vector> layouts = {{0, 1}, {1, 0}}; for (const std::vector& execute_layout : layouts) { @@ -63,15 +62,15 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) { TF_ASSERT_OK_AND_ASSIGN( auto computed, client_->Transfer(*data, &expected_literal->shape())); - LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(), - computed->shape()); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed); + ASSERT_TRUE(LiteralTestUtil::EqualShapesAndLayouts( + expected_literal->shape(), computed->shape())); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed)); } } } XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) { - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Tuple({b.ConstantR2({{1, 2}, {3, 4}}), b.ConstantR2({{10, 20}, {30, 40}})}); @@ -92,9 +91,9 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) { auto result, client_->ExecuteAndTransfer(computation, {}, &execution_options)); LiteralTestUtil::ExpectR2Equal({{1, 2}, {3, 4}}, - LiteralView::Create(*result, {0})); + LiteralSlice(*result, {0})); LiteralTestUtil::ExpectR2Equal({{10, 20}, {30, 40}}, - LiteralView::Create(*result, {1})); + LiteralSlice(*result, {1})); EXPECT_TRUE(ShapeUtil::IsTuple(result->shape())); EXPECT_EQ(2, ShapeUtil::TupleElementCount(result->shape())); @@ -143,7 +142,7 @@ XLA_TEST_F(ClientTest, DISABLED_ON_GPU(ExecuteParallel)) { auto result_literal, client_->Transfer(*results[0], &expected_result->shape())); - LiteralTestUtil::ExpectEqual(*expected_result, *result_literal); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_result, *result_literal)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/compilation_cache_test.cc b/tensorflow/compiler/xla/tests/compilation_cache_test.cc index 0f780fa87ef98f..50a006964869b3 100644 --- a/tensorflow/compiler/xla/tests/compilation_cache_test.cc +++ b/tensorflow/compiler/xla/tests/compilation_cache_test.cc @@ -17,10 +17,10 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -39,7 +39,7 @@ namespace { class CompilationCacheTest : public ClientLibraryTestBase { public: void ExecuteComputationR0F32( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, float expected_result, bool expect_cache_hit) { ExecutionProfile execution_profile; @@ -49,13 +49,13 @@ class CompilationCacheTest : public ClientLibraryTestBase { /*execution_options=*/&execution_options_, &execution_profile) .ConsumeValueOrDie(); - LiteralTestUtil::ExpectNear(*Literal::CreateR0(expected_result), - *result, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR0(expected_result), *result, error_spec_)); EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit()); } void ExecuteComputationR2F32( - const Computation& computation, + const XlaComputation& computation, tensorflow::gtl::ArraySlice arguments, std::initializer_list> expected_result, bool expect_cache_hit) { @@ -66,25 +66,28 @@ class CompilationCacheTest : public ClientLibraryTestBase { .ConsumeValueOrDie(); std::unique_ptr result = client_->Transfer(*data_handle).ConsumeValueOrDie(); - LiteralTestUtil::ExpectNear(*Literal::CreateR2(expected_result), - *result, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR2(expected_result), *result, error_spec_)); EXPECT_EQ(expect_cache_hit, execution_profile.compilation_cache_hit()); } ErrorSpec error_spec_{0.0001}; }; -XLA_TEST_F(CompilationCacheTest, ComputationCalledMultipleTimes) { - ComputationBuilder builder(client_, TestName()); +// TODO(b/74197823): Disabled because there is no cache in the new design. +XLA_TEST_F(CompilationCacheTest, DISABLED_ComputationCalledMultipleTimes) { + XlaBuilder builder(TestName()); builder.Neg(builder.ConstantR0(42.0)); - Computation computation = builder.Build().ConsumeValueOrDie(); + XlaComputation computation = builder.Build().ConsumeValueOrDie(); ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/false); ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true); ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true); } -XLA_TEST_F(CompilationCacheTest, ComputationCalledWithDifferentParameters) { +// TODO(b/74197823): Disabled because there is no cache in the new design. 
+XLA_TEST_F(CompilationCacheTest, + DISABLED_ComputationCalledWithDifferentParameters) { std::unique_ptr data_42 = client_->TransferToServer(*Literal::CreateR0(42.0f)) .ConsumeValueOrDie(); @@ -95,9 +98,9 @@ XLA_TEST_F(CompilationCacheTest, ComputationCalledWithDifferentParameters) { client_->TransferToServer(*Literal::CreateR0(456.0f)) .ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Neg(builder.Parameter(0, ShapeUtil::MakeShape(F32, {}), "param")); - Computation computation = builder.Build().ConsumeValueOrDie(); + XlaComputation computation = builder.Build().ConsumeValueOrDie(); ExecuteComputationR0F32(computation, {data_42.get()}, -42.0, /*expect_cache_hit=*/false); @@ -109,19 +112,20 @@ XLA_TEST_F(CompilationCacheTest, ComputationCalledWithDifferentParameters) { /*expect_cache_hit=*/true); } -XLA_TEST_F(CompilationCacheTest, MultipleComputations) { - ComputationBuilder builder_neg(client_, TestName() + "_neg"); +// TODO(b/74197823): Disabled because there is no cache in the new design. +XLA_TEST_F(CompilationCacheTest, DISABLED_MultipleComputations) { + XlaBuilder builder_neg(TestName() + "_neg"); builder_neg.Neg(builder_neg.ConstantR0(42.0)); - Computation computation_neg = builder_neg.Build().ConsumeValueOrDie(); + XlaComputation computation_neg = builder_neg.Build().ConsumeValueOrDie(); - ComputationBuilder builder_exp(client_, TestName() + "_exp"); + XlaBuilder builder_exp(TestName() + "_exp"); builder_exp.Exp(builder_exp.ConstantR0(1.0)); - Computation computation_exp = builder_exp.Build().ConsumeValueOrDie(); + XlaComputation computation_exp = builder_exp.Build().ConsumeValueOrDie(); - ComputationBuilder builder_add(client_, TestName() + "_add"); + XlaBuilder builder_add(TestName() + "_add"); builder_add.Add(builder_add.ConstantR0(2.0), builder_add.ConstantR0(3.0)); - Computation computation_add = builder_add.Build().ConsumeValueOrDie(); + XlaComputation computation_add = builder_add.Build().ConsumeValueOrDie(); ExecuteComputationR0F32(computation_neg, {}, -42.0, /*expect_cache_hit=*/false); @@ -133,7 +137,8 @@ XLA_TEST_F(CompilationCacheTest, MultipleComputations) { /*expect_cache_hit=*/true); } -XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) { +// TODO(b/74197823): Disabled because there is no cache in the new design. +XLA_TEST_F(CompilationCacheTest, DISABLED_DifferentParameterLayouts) { // Create two GlobalData arrays with the same shape but different // layouts. Use these arrays as parameters to a simple computation. If the // layout of the array changes then computation should be recompiled (cache @@ -148,9 +153,9 @@ XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) { auto colmaj_handle = client_->TransferToServer(*colmaj_array).ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Parameter(0, ShapeUtil::MakeShape(F32, {2, 2}), "param0"); - Computation computation = builder.Build().ConsumeValueOrDie(); + XlaComputation computation = builder.Build().ConsumeValueOrDie(); ExecuteComputationR2F32(computation, {colmaj_handle.get()}, {{1.0f, 2.0f}, {3.0f, 4.0f}}, @@ -169,32 +174,5 @@ XLA_TEST_F(CompilationCacheTest, DifferentParameterLayouts) { /*expect_cache_hit=*/true); } -XLA_TEST_F(CompilationCacheTest, MutatedComputation) { - // Build a computation, execute it, then mutate it. The mutated computation - // should not be in the cache until it is run once. 
This must be done through - // the stub interface because Computations built from ComputationBuilder are - // immutable. - ComputationBuilder builder(client_, TestName()); - auto neg = builder.Neg(builder.ConstantR0(42.0)); - Computation computation = builder.Build().ConsumeValueOrDie(); - - ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/false); - ExecuteComputationR0F32(computation, {}, -42.0, /*expect_cache_hit=*/true); - - BinaryOpRequest request; - request.set_binop(BINOP_ADD); - *request.mutable_lhs() = neg; - *request.mutable_rhs() = neg; - OpRequest op_request; - *op_request.mutable_computation() = computation.handle(); - *op_request.mutable_binary_op_request() = request; - OpResponse response; - tensorflow::Status s = client_->stub()->Op(&op_request, &response); - ASSERT_TRUE(s.ok()); - - ExecuteComputationR0F32(computation, {}, -84.0, /*expect_cache_hit=*/false); - ExecuteComputationR0F32(computation, {}, -84.0, /*expect_cache_hit=*/true); -} - } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/compute_constant_test.cc b/tensorflow/compiler/xla/tests/compute_constant_test.cc index 7ea82a791f72ea..ba22530f1cfee5 100644 --- a/tensorflow/compiler/xla/tests/compute_constant_test.cc +++ b/tensorflow/compiler/xla/tests/compute_constant_test.cc @@ -18,8 +18,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" @@ -88,17 +86,6 @@ class ComputeConstantTest : public ::testing::Test { return literal->Get({}); } - template - StatusOr ComputeConstantScalar( - Client* client, const ComputationDataHandle& operand, - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice parameters = {}) { - TF_ASSIGN_OR_RETURN(auto literal, - builder->ComputeConstant( - operand, /*output_layout=*/nullptr, parameters)); - return literal->Get({}); - } - bool IsConstant(const XlaOp& operand, XlaBuilder* builder) { StatusOr result = builder->IsConstant(operand); EXPECT_TRUE(result.ok()) << result.status(); @@ -150,26 +137,6 @@ TEST_F(ComputeConstantTest, ScalarRng) { } } -TEST_F(ComputeConstantTest, Param) { - for (ClientType client_type : client_types) { - Client* client = ClientOrDie(platform_, client_type); - ComputationBuilder b(client, TestName()); - auto param = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "lhs"); - auto computation = b.Add(param, b.ConstantR0(1.5f)); - - std::vector arguments; - arguments.push_back(std::move(*Literal::CreateR0(42.5f))); - TF_ASSERT_OK_AND_ASSIGN(bool is_constant, - b.IsConstant(computation, arguments.size())); - EXPECT_TRUE(is_constant); - - TF_ASSERT_OK_AND_ASSIGN( - auto value, - ComputeConstantScalar(client, computation, &b, arguments)); - EXPECT_EQ(value, 44.0f); - } -} - TEST_F(ComputeConstantTest, DirectParamMissing) { for (ClientType client_type : client_types) { Client* client = ClientOrDie(platform_, client_type); @@ -241,7 +208,7 @@ TEST_F(ComputeConstantTest, NonScalarAdd) { ComputeConstantLiteral(client, computation, &b)); std::unique_ptr expected_literal = Literal::CreateR1({4, 6}); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed)); } } @@ -255,7 +222,7 @@ 
TEST_F(ComputeConstantTest, IntegerDivide) { TF_ASSERT_OK_AND_ASSIGN(auto computed, ComputeConstantLiteral(client, computation, &b)); std::unique_ptr expected_literal = Literal::CreateR0(5); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed)); } } @@ -277,9 +244,9 @@ XLA_TEST_F(ComputeConstantTest, Layout) { std::unique_ptr expected_literal = Literal::CreateR2WithLayout({{11, 22}, {33, 44}}, LayoutUtil::MakeLayout(layout)); - LiteralTestUtil::AssertEqualShapesAndLayouts(expected_literal->shape(), - computed->shape()); - LiteralTestUtil::ExpectEqual(*expected_literal, *computed); + ASSERT_TRUE(LiteralTestUtil::EqualShapesAndLayouts( + expected_literal->shape(), computed->shape())); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *computed)); } } } diff --git a/tensorflow/compiler/xla/tests/constants_test.cc b/tensorflow/compiler/xla/tests/constants_test.cc index 35aa3f6d696297..916ffadbc798ec 100644 --- a/tensorflow/compiler/xla/tests/constants_test.cc +++ b/tensorflow/compiler/xla/tests/constants_test.cc @@ -21,12 +21,11 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -39,7 +38,7 @@ class ConstantsTest : public ClientLibraryTestBase { }; TEST_F(ConstantsTest, ZeroCellF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1({}); ComputeAndCompareR1(&builder, {}, {}, error_spec_); @@ -48,7 +47,7 @@ TEST_F(ConstantsTest, ZeroCellF32) { TEST_F(ConstantsTest, OneCellF32) { std::vector constant = {2.0}; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1(constant); ComputeAndCompareR1(&builder, constant, {}, error_spec_); @@ -57,7 +56,7 @@ TEST_F(ConstantsTest, OneCellF32) { TEST_F(ConstantsTest, OneCellS32) { std::vector constant = {2}; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1(constant); ComputeAndCompareR1(&builder, constant, {}); @@ -66,7 +65,7 @@ TEST_F(ConstantsTest, OneCellS32) { TEST_F(ConstantsTest, OneCellU32) { std::vector constant = {2}; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1(constant); ComputeAndCompareR1(&builder, constant, {}); @@ -75,7 +74,7 @@ TEST_F(ConstantsTest, OneCellU32) { TEST_F(ConstantsTest, EightCells) { std::vector constant = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1(constant); ComputeAndCompareR1(&builder, constant, {}, error_spec_); @@ -85,14 +84,14 @@ TEST_F(ConstantsTest, SixteenCells) { std::vector constant = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0}; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1(constant); 
ComputeAndCompareR1(&builder, constant, {}, error_spec_); } TEST_F(ConstantsTest, Empty_0x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR2FromArray2D(Array2D(0, 2)); ComputeAndCompareR2(&builder, Array2D(0, 2), {}, error_spec_); @@ -102,14 +101,14 @@ TEST_F(ConstantsTest, Small_2x2) { std::unique_ptr> constant = MakeLinspaceArray2D(100.0, 200.0, 2, 2); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR2FromArray2D(*constant); ComputeAndCompareR2(&builder, *constant, {}, error_spec_); } TEST_F(ConstantsTest, Empty_3x0x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto constant = builder.ConstantLiteral( *Literal::CreateR3FromArray3D(Array3D(3, 0, 2))); @@ -117,7 +116,7 @@ TEST_F(ConstantsTest, Empty_3x0x2) { } TEST_F(ConstantsTest, Small_2x2x2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Array3D array3d({ // x0 x1 {{1.f, 2.f}, // y0 @@ -145,13 +144,13 @@ TEST_F(ConstantsTest, Small_3x2x1x1) { Literal::CreateR4FromArray4D(input_array); { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantLiteral(*input_literal); ComputeAndCompareR4(&builder, input_array, {}, error_spec_); } { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR4FromArray4D(input_array); ComputeAndCompareR4(&builder, input_array, {}, error_spec_); } @@ -159,17 +158,18 @@ TEST_F(ConstantsTest, Small_3x2x1x1) { // TODO(b/29263943): Support tuple constants. TEST_F(ConstantsTest, DISABLED_TupleConstant) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantLiteral( *Literal::MakeTuple({Literal::CreateR2({{1.0}, {2.0}}).get(), Literal::CreateR1({2.0, 42}).get()})); - std::unique_ptr result = ExecuteAndTransferOrDie(&builder, {}); + std::unique_ptr result = + ExecuteAndTransfer(&builder, {}).ConsumeValueOrDie(); LiteralTestUtil::ExpectR2Near( - {{1.0}, {2.0}}, LiteralView::Create(*result, {0}), error_spec_); + {{1.0}, {2.0}}, LiteralSlice(*result, {0}), error_spec_); LiteralTestUtil::ExpectR1Near( - {2.0, 42.0}, LiteralView::Create(*result, {1}), error_spec_); + {2.0, 42.0}, LiteralSlice(*result, {1}), error_spec_); } } // namespace diff --git a/tensorflow/compiler/xla/tests/convert_test.cc b/tensorflow/compiler/xla/tests/convert_test.cc index e67a30d76c2fac..722d882471a41a 100644 --- a/tensorflow/compiler/xla/tests/convert_test.cc +++ b/tensorflow/compiler/xla/tests/convert_test.cc @@ -18,8 +18,8 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" @@ -44,7 +44,7 @@ class ConvertTest : public ClientLibraryTestBase { }; TEST_F(ConvertTest, ConvertR1S32ToR1S32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({42, 64}); builder.ConvertElementType(a, S32); @@ -53,7 +53,7 @@ TEST_F(ConvertTest, ConvertR1S32ToR1S32) { } TEST_F(ConvertTest, ConvertR1F32ToR1F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({42.0f, 64.0f}); builder.ConvertElementType(a, F32); @@ -62,7 +62,7 @@ TEST_F(ConvertTest, ConvertR1F32ToR1F32) { } TEST_F(ConvertTest, ConvertR1S32ToR1F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({42, 64}); builder.ConvertElementType(a, F32); @@ -71,7 +71,7 @@ TEST_F(ConvertTest, ConvertR1S32ToR1F32) { } TEST_F(ConvertTest, ConvertR1PREDToR1S32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({true, false, true}); builder.ConvertElementType(a, S32); @@ -80,7 +80,7 @@ TEST_F(ConvertTest, ConvertR1PREDToR1S32) { } TEST_F(ConvertTest, ConvertR1PREDToR1F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({true, false, true}); builder.ConvertElementType(a, F32); @@ -89,7 +89,7 @@ TEST_F(ConvertTest, ConvertR1PREDToR1F32) { } XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({}); builder.ConvertElementType(a, F32); @@ -98,7 +98,7 @@ XLA_TEST_F(ConvertTest, ConvertR1S0S32ToR1S0F32) { } TEST_F(ConvertTest, ConvertR1F32ToR1S32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({42.6, 64.4}); builder.ConvertElementType(a, S32); @@ -107,7 +107,7 @@ TEST_F(ConvertTest, ConvertR1F32ToR1S32) { } XLA_TEST_F(ConvertTest, ConvertR1S64ToR1F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector arg{ -9223371216516022272, -2, @@ -160,7 +160,7 @@ XLA_TEST_F(ConvertTest, ConvertR1S64ToR1F32) { } XLA_TEST_F(ConvertTest, ConvertR1U32ToR1F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector arg{0, 1, 0x1000, 0x7fffffff, 0x80000000, 0x80000001, 0x80000002, 0x80000003, 0x80000080, 0x80000081, 0x80000082, 0xFFFFFFFF}; @@ -179,7 +179,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U32ToR1F32) { } XLA_TEST_F(ConvertTest, ConvertR1F32ToR1U32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector arg{0.0f, 1.0f, 16777216.0f, 16777218.0f, 2147483647.0f, 4294967040.0f}; std::unique_ptr arg_literal = Literal::CreateR1({arg}); @@ -197,7 +197,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1U32) { } XLA_TEST_F(ConvertTest, ConvertR1U32ToR1S64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector arg{0, 1, 0x1000, 0x7fffffff, 0x80000082, 0xFFFFFFFF}; std::unique_ptr arg_literal = Literal::CreateR1({arg}); auto 
arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); @@ -214,7 +214,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U32ToR1S64) { } XLA_TEST_F(ConvertTest, ConvertR1S32ToR1S64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector arg{0, 1, 0x1000, -1, -0x1000}; std::unique_ptr arg_literal = Literal::CreateR1({arg}); auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); @@ -231,7 +231,7 @@ XLA_TEST_F(ConvertTest, ConvertR1S32ToR1S64) { } XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // Test cases from compiler_rt library. std::vector arg{0.0f, 0.5f, @@ -249,10 +249,10 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) { -1.99f, -2.0f, -2.01f, - 0x1.FFFFFEp+62F, - 0x1.FFFFFCp+62F, - -0x1.FFFFFEp+62F, - -0x1.FFFFFCp+62F}; + 9223371487098961920.f, + 9223370937343148032.f, + -9223371487098961920.f, + -9223370937343148032.f}; std::unique_ptr arg_literal = Literal::CreateR1({arg}); auto arg_param = builder.Parameter(0, arg_literal->shape(), "arg_param"); std::unique_ptr arg_data = @@ -268,7 +268,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1S64) { } XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({32, 64}); builder.ConvertElementType(a, F32); @@ -277,7 +277,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1F32) { } XLA_TEST_F(ConvertTest, ConvertR1U8ToR1S32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({32, 64}); builder.ConvertElementType(a, S32); @@ -286,7 +286,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1S32) { } XLA_TEST_F(ConvertTest, ConvertR1U8ToR1U32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({32, 64}); builder.ConvertElementType(a, U32); @@ -295,7 +295,7 @@ XLA_TEST_F(ConvertTest, ConvertR1U8ToR1U32) { } XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({32.0f, 64.0f}); builder.ConvertElementType(a, F64); @@ -304,7 +304,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F64) { } XLA_TEST_F(ConvertTest, ConvertR1F64ToR1F32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1({32.0, 64.0}); builder.ConvertElementType(a, F32); @@ -313,7 +313,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F64ToR1F32) { } TEST_F(ConvertTest, ConvertS32Extremes) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.ConstantR1( {std::numeric_limits::min(), std::numeric_limits::max()}); builder.ConvertElementType(a, F32); @@ -325,7 +325,7 @@ TEST_F(ConvertTest, ConvertS32Extremes) { } TEST_F(ConvertTest, ConvertMapToS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto b = builder.CreateSubBuilder("convert"); auto param = b->Parameter(0, ShapeUtil::MakeShape(F32, {}), "in"); b->ConvertElementType(param, S32); @@ -337,7 +337,7 @@ TEST_F(ConvertTest, ConvertMapToS32) { } TEST_F(ConvertTest, ConvertMapToF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto b = builder.CreateSubBuilder("convert"); auto param = b->Parameter(0, ShapeUtil::MakeShape(S32, {}), "in"); b->ConvertElementType(param, F32); @@ -354,7 +354,7 @@ 
TEST_F(ConvertTest, ConvertMapToF32) { // input -> convert -> reshape // the new convert should have the same element type as the old convert. TEST_F(ConvertTest, ConvertReshape) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto input = builder.ConstantR1({42}); auto reshape = builder.Reshape(input, /*dimensions=*/{0}, /*new_sizes=*/{}); builder.ConvertElementType(reshape, F32); @@ -393,7 +393,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F16ToR1F32) { std::unique_ptr dot_lhs_handle, client_->TransferToServer(*Literal::CreateR1(input))); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConvertElementType( builder.Parameter( 0, ShapeUtil::MakeShape(F16, {static_cast(input.size())}), @@ -413,7 +413,7 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F16) { std::unique_ptr dot_lhs_handle, client_->TransferToServer(*Literal::CreateR1(input))); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConvertElementType( builder.Parameter( 0, ShapeUtil::MakeShape(F32, {static_cast(input.size())}), @@ -424,28 +424,28 @@ XLA_TEST_F(ConvertTest, ConvertR1F32ToR1F16) { } XLA_TEST_F(ConvertTest, ConvertC64ToC64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector x = {{42.0f, 64.0f}}; builder.ConvertElementType(builder.ConstantR1(x), C64); ComputeAndCompareR1(&builder, x, {}, ErrorSpec(0.0001)); } XLA_TEST_F(ConvertTest, ConvertS64S64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector x = {{-42, 64}}; builder.ConvertElementType(builder.ConstantR1(x), S64); ComputeAndCompareR1(&builder, x, {}); } XLA_TEST_F(ConvertTest, ConvertU64U64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector x = {{42, 64}}; builder.ConvertElementType(builder.ConstantR1(x), U64); ComputeAndCompareR1(&builder, x, {}); } XLA_TEST_F(ConvertTest, ConvertU64S64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector unsigned_x = {{42, UINT64_MAX}}; builder.ConvertElementType(builder.ConstantR1(unsigned_x), S64); std::vector signed_x = {{42, -1}}; @@ -453,7 +453,7 @@ XLA_TEST_F(ConvertTest, ConvertU64S64) { } XLA_TEST_F(ConvertTest, ConvertS64U64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::vector signed_x = {{42, -1, INT64_MIN}}; builder.ConvertElementType(builder.ConstantR1(signed_x), U64); std::vector unsigned_x = { diff --git a/tensorflow/compiler/xla/tests/convolution_variants_test.cc b/tensorflow/compiler/xla/tests/convolution_variants_test.cc index 50d6e25d868c49..fea850dc135e33 100644 --- a/tensorflow/compiler/xla/tests/convolution_variants_test.cc +++ b/tensorflow/compiler/xla/tests/convolution_variants_test.cc @@ -25,7 +25,6 @@ limitations under the License. #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/padding.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" diff --git a/tensorflow/compiler/xla/tests/copy_test.cc b/tensorflow/compiler/xla/tests/copy_test.cc index ece7c3b05e7faf..2b3390ca98cb29 100644 --- a/tensorflow/compiler/xla/tests/copy_test.cc +++ b/tensorflow/compiler/xla/tests/copy_test.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include "tensorflow/compiler/xla/array2d.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/ptr_util.h" #include "tensorflow/compiler/xla/service/hlo_computation.h" @@ -48,7 +49,7 @@ class CopyOpTest : public HloTestBase { module->AddEntryComputation(std::move(computation)); std::unique_ptr result = ExecuteAndTransfer(std::move(module), {}); - LiteralTestUtil::ExpectEqual(literal, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(literal, *result)); } void TestCopyConstantLayout021(size_t n1, size_t n2, size_t n3); @@ -246,13 +247,13 @@ XLA_TEST_F(CopyOpClientTest, Copy0x0) { Shape out_shape = ShapeUtil::MakeShapeWithLayout(F32, {0, 0}, {1, 0}); auto empty = Literal::CreateFromShape(in_shape); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto param0 = builder.Parameter(0, in_shape, "input"); auto input_data = client_->TransferToServer(*empty).ConsumeValueOrDie(); auto actual = ExecuteAndTransfer(&builder, {input_data.get()}, &out_shape) .ConsumeValueOrDie(); - LiteralTestUtil::ExpectEqual(*empty, *actual); + EXPECT_TRUE(LiteralTestUtil::Equal(*empty, *actual)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc new file mode 100644 index 00000000000000..b151187c4b8f01 --- /dev/null +++ b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc @@ -0,0 +1,103 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" +#include "tensorflow/compiler/xla/shape_util.h" +#include "tensorflow/compiler/xla/test.h" +#include "tensorflow/compiler/xla/test_helpers.h" +#include "tensorflow/compiler/xla/tests/hlo_test_base.h" +#include "tensorflow/compiler/xla/tests/test_macros.h" + +namespace xla { +namespace { + +class TrivialCrossReplicaSumTest : public HloTestBase {}; + +// Currently the CPU and GPU backends only support CrossReplicaSum with one +// replica. But we can at least check this. 
+ +XLA_TEST_F(TrivialCrossReplicaSumTest, OneOperand) { + const char* module_str = R"( + HloModule test + + add { + x = f32[] parameter(0) + y = f32[] parameter(1) + add = f32[] add(x, y) + } + + ENTRY test_computation { + p = f32[3] parameter(0) + ROOT crs = f32[3] cross-replica-sum(p), to_apply=add + })"; + auto module = + ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie(); + auto literal = Literal::CreateR1({1, 2, 3}); + EXPECT_EQ(*literal, *ExecuteAndTransfer(std::move(module), {literal.get()})); +} + +XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) { + const char* module_str = R"( + HloModule test + + add { + x = f32[] parameter(0) + y = f32[] parameter(1) + add = f32[] add(x, y) + } + + ENTRY test_computation { + p0 = f32[3] parameter(0) + p1 = f32[2] parameter(1) + ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add + })"; + auto module = + ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie(); + auto literal0 = Literal::CreateR1({1, 2, 3}); + auto literal1 = Literal::CreateR1({10, 20}); + EXPECT_EQ( + *Literal::MakeTuple({literal0.get(), literal1.get()}), + *ExecuteAndTransfer(std::move(module), {literal0.get(), literal1.get()})); +} + +// On the GPU backend, constants get special handling. Someone might pass a +// constant to CRS to e.g. count the number of replicas -- we need to make sure +// it works. +XLA_TEST_F(TrivialCrossReplicaSumTest, ConstantOperand) { + const char* module_str = R"( + HloModule test + + add { + x = f32[] parameter(0) + y = f32[] parameter(1) + add = f32[] add(x, y) + } + + ENTRY test_computation { + p0 = f32[3] parameter(0) + p1 = f32[2] constant({10, 20}) + ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add + })"; + auto module = + ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie(); + auto literal0 = Literal::CreateR1({1, 2, 3}); + auto literal1 = Literal::CreateR1({10, 20}); + EXPECT_EQ(*Literal::MakeTuple({literal0.get(), literal1.get()}), + *ExecuteAndTransfer(std::move(module), {literal0.get()})); +} + +} // namespace +} // namespace xla diff --git a/tensorflow/compiler/xla/tests/deallocation_test.cc b/tensorflow/compiler/xla/tests/deallocation_test.cc index fe5621e8dc209d..bfe688e20d182d 100644 --- a/tensorflow/compiler/xla/tests/deallocation_test.cc +++ b/tensorflow/compiler/xla/tests/deallocation_test.cc @@ -15,10 +15,10 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -36,9 +36,8 @@ class DeallocationTest : public ClientLibraryTestBase { // Build and execute the given computation then verify the results can be // transferred from the device successfully. 
std::unique_ptr ExecuteAndCheckTransfer( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments) { - Computation computation = builder->Build().ConsumeValueOrDie(); + XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaComputation computation = builder->Build().ConsumeValueOrDie(); auto global_data = client_->Execute(computation, arguments, &execution_options_) .ConsumeValueOrDie(); @@ -48,7 +47,7 @@ class DeallocationTest : public ClientLibraryTestBase { }; TEST_F(DeallocationTest, DeallocateScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(42.0); auto global_data = ExecuteAndCheckTransfer(&builder, {}); @@ -66,7 +65,7 @@ TEST_F(DeallocationTest, DeallocateScalar) { } TEST_F(DeallocationTest, DeallocateVector) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); @@ -79,7 +78,7 @@ TEST_F(DeallocationTest, DeallocateVector) { } TEST_F(DeallocationTest, DeallocateEmptyVector) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1({}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); @@ -92,7 +91,7 @@ TEST_F(DeallocationTest, DeallocateEmptyVector) { } XLA_TEST_F(DeallocationTest, DeallocateTuple) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Tuple({builder.ConstantR0(42.0), builder.ConstantR1({1.0, 2.0, 3.0})}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); @@ -106,7 +105,7 @@ XLA_TEST_F(DeallocationTest, DeallocateTuple) { } XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto element = builder.ConstantR0(42.0); auto inner_tuple = builder.Tuple({builder.ConstantR0(42.0), element}); builder.Tuple({element, inner_tuple, element}); @@ -121,7 +120,7 @@ XLA_TEST_F(DeallocationTest, DeallocateTupleWithRepeatedElements) { } XLA_TEST_F(DeallocationTest, DeallocateNestedTuple) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto inner_tuple = builder.Tuple({builder.ConstantR0(42.0), builder.ConstantR1({1.0, 2.0, 3.0})}); diff --git a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc index 3ab0ea4ad48c00..12789fe66530fe 100644 --- a/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc +++ b/tensorflow/compiler/xla/tests/deconstruct_tuple_test.cc @@ -16,10 +16,10 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" @@ -42,9 +42,8 @@ class DeconstructTupleTest : public ClientLibraryTestBase { // Build and execute the given computation then verify the results can be // transferred from the device successfully. 
std::unique_ptr ExecuteAndCheckTransfer( - ComputationBuilder* builder, - tensorflow::gtl::ArraySlice arguments) { - Computation computation = builder->Build().ConsumeValueOrDie(); + XlaBuilder* builder, tensorflow::gtl::ArraySlice arguments) { + XlaComputation computation = builder->Build().ConsumeValueOrDie(); auto global_data = client_->Execute(computation, arguments, &execution_options_) .ConsumeValueOrDie(); @@ -54,7 +53,7 @@ class DeconstructTupleTest : public ClientLibraryTestBase { }; TEST_F(DeconstructTupleTest, DeconstructTuple) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); builder.Tuple({const1, const2}); @@ -73,7 +72,7 @@ TEST_F(DeconstructTupleTest, DeconstructTuple) { } TEST_F(DeconstructTupleTest, DeconstructTupleTwice) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); builder.Tuple({const1, const2}); @@ -103,7 +102,7 @@ TEST_F(DeconstructTupleTest, DeconstructTupleTwice) { } XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); builder.Tuple({const1, const2, const2, const1}); @@ -129,7 +128,7 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleRepeatedElement) { } TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); builder.Tuple({const1, const2, const1}); @@ -159,7 +158,7 @@ TEST_F(DeconstructTupleTest, DeconstructTupleThenDeallocate) { } TEST_F(DeconstructTupleTest, DeconstructNonTuple) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); auto global_data = ExecuteAndCheckTransfer(&builder, {}); @@ -170,7 +169,7 @@ TEST_F(DeconstructTupleTest, DeconstructNonTuple) { } XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr param0_literal = Literal::CreateR1({3.14f, -100.25f}); std::unique_ptr param0_data = @@ -186,7 +185,7 @@ XLA_TEST_F(DeconstructTupleTest, DeconstructTupleFromParam) { } XLA_TEST_F(DeconstructTupleTest, DeconstructNestedTuple) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto const1 = builder.ConstantR1({1.0, 2.0, 3.0, 4.0}); auto const2 = builder.ConstantR1({2.0, 4.0, 6.0, 8.0}); builder.Tuple({builder.Tuple({const1, const2}), const1}); diff --git a/tensorflow/compiler/xla/tests/deep_graph_test.cc b/tensorflow/compiler/xla/tests/deep_graph_test.cc index 1da7a96fe2388e..085a5105aca1c1 100644 --- a/tensorflow/compiler/xla/tests/deep_graph_test.cc +++ b/tensorflow/compiler/xla/tests/deep_graph_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" namespace xla { @@ -22,12 +23,12 @@ TEST_F(ClientLibraryTestBase, DeepGraph) { // intended to track, we need to set kDepth to 20000. // Unfortunately, setting it that high causes the test to time out. const int kDepth = 200; - ComputationBuilder b(client_, TestName()); - ComputationDataHandle x; - ComputationDataHandle y; + XlaBuilder b(TestName()); + XlaOp x; + XlaOp y; auto x_data = CreateR0Parameter(3, 0, "x", &b, &x); auto y_data = CreateR0Parameter(1, 1, "y", &b, &y); - ComputationDataHandle z = x; + XlaOp z = x; for (int i = 0; i < kDepth; ++i) { z = b.Add(z, y); } diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc index c4031dfee593a1..0fd846cef8095a 100644 --- a/tensorflow/compiler/xla/tests/dot_operation_test.cc +++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc @@ -18,8 +18,8 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -51,21 +51,20 @@ using TypesF16F32F64 = ::testing::Types; using TypesF16F32F64CF64 = ::testing::Types; #elif !defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16) && \ - defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) && \ + defined(XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT64) && \ defined(XLA_BACKEND_DOES_NOT_SUPPORT_COMPLEX) using TypesF16F32 = ::testing::Types; using TypesF16F32F64 = ::testing::Types; -using TypesF16F32F64CF64 = - ::testing::Types; +using TypesF16F32F64CF64 = ::testing::Types; #else #error "Situation not handled yet" #endif // Check that we can safely pass an input tuple's elements to a dot operation. 
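The hunks above follow one mechanical migration: `ComputationBuilder` becomes `XlaBuilder` (and no longer takes the `Client*`), `Computation` becomes `XlaComputation`, and `ComputationDataHandle` becomes `XlaOp`. A minimal sketch of the post-migration test body, assuming the `ClientLibraryTestBase` members `client_` and `execution_options_` that these tests use (a fragment for illustration, not a standalone program):

```c++
// Post-migration pattern used throughout this diff (illustrative fragment).
XlaBuilder builder(TestName());             // the client is no longer passed in
XlaOp x = builder.ConstantR0<float>(42.0);  // ops are plain XlaOp handles
XlaOp y = builder.Add(x, x);
XlaComputation computation = builder.Build().ConsumeValueOrDie();
// Execution still goes through the client, exactly as before:
auto global_data =
    client_->Execute(computation, /*arguments=*/{}, &execution_options_)
        .ConsumeValueOrDie();
```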
-TEST_F(DotOperationTest, DotOfInputTupleElem) { - ComputationBuilder builder(client_, TestName()); +XLA_TEST_F(DotOperationTest, DotOfInputTupleElem) { + XlaBuilder builder(TestName()); - ComputationDataHandle param; + XlaOp param; auto param_data = CreateParameterAndTransferLiteral( 0, *Literal::MakeTuple({Literal::CreateR2({{1, 2}, {3, 4}}).get(), @@ -86,7 +85,7 @@ TYPED_TEST_CASE(DotOperationTest_F16F32F64CF64, TypesF16F32F64CF64); XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, ZeroElementVectorDot) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantR1({}); auto rhs = builder.ConstantR1({}); @@ -102,7 +101,7 @@ TYPED_TEST_CASE(DotOperationTest_F16F32F64, TypesF16F32F64); XLA_TYPED_TEST(DotOperationTest_F16F32F64, TrivialMatrixVectorDot) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantR2FromArray2D({{3.0f, 4.0f}}); auto rhs = builder.ConstantFromArray({3.0f, 4.0f}); auto result = builder.Dot(lhs, rhs); @@ -113,7 +112,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TrivialMatrixVectorDot) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, OneElementVectorDot) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantR1({static_cast(2.0f)}); auto rhs = builder.ConstantR1({static_cast(3.0f)}); auto result = builder.Dot(lhs, rhs); @@ -124,7 +123,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, OneElementVectorDot) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, VectorDot) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantFromArray({1.0f, 2.5f, 42.0f}); auto rhs = builder.ConstantFromArray({11.0f, -1.0f, 0.5f}); auto result = builder.Dot(lhs, rhs); @@ -139,7 +138,7 @@ std::vector MinorToMajorForIsRowMajor(bool row_major) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x0) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 2)); auto rhs = builder.ConstantR2FromArray2D(Array2D(2, 0)); auto result = builder.Dot(lhs, rhs); @@ -150,7 +149,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x0) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x3) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 2)); auto rhs = builder.ConstantR2FromArray2D( {{7.0f, 8.0f, 9.0f}, {42.0f, 77.0f, 101.0f}}); @@ -162,7 +161,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_0x2_2x3) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_3x2_2x0) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantR2FromArray2D( {{7.0f, 8.0f}, {9.0f, 42.0f}, {77.0f, 101.0f}}); auto rhs = builder.ConstantR2FromArray2D(Array2D(2, 0)); @@ -174,7 +173,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_3x2_2x0) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_2x0_0x2) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs = builder.ConstantR2FromArray2D(Array2D(2, 0)); auto rhs = 
builder.ConstantR2FromArray2D(Array2D(0, 2)); auto result = builder.Dot(lhs, rhs); @@ -185,7 +184,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, Dot_2x0_0x2) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, FusedDot) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto param0 = builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 4}), "arg0"); auto param1 = @@ -230,7 +229,7 @@ class SquareMatrixDot : public DotOperationTest { LayoutUtil::MakeLayout( MinorToMajorForIsRowMajor(rhs_row_major)))) .ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); auto result = builder.Dot( builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}), "lhs"), @@ -315,7 +314,7 @@ void ParametricDotTest::TestImpl() { addend_handle = client_->TransferToServer(*addend_lit).ConsumeValueOrDie(); } - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); auto result = builder.Dot( builder.Parameter(0, @@ -491,7 +490,7 @@ class NonsquareMatrixDot : public DotOperationTest { MinorToMajorForIsRowMajor(rhs_row_major)))) .ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); auto result = builder.Dot( builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 3}), "lhs"), @@ -523,7 +522,7 @@ XLA_TEST_F(DotOperationTest, MatrixVectorC64) { LayoutUtil::MakeLayout({1, 0}))) .ConsumeValueOrDie(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); auto result = builder.Dot( builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {1, 4}), "lhs"), @@ -538,7 +537,7 @@ XLA_TEST_F(DotOperationTest, MatrixVectorC64) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, ConcurrentMatMult) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto matrix1 = builder.ConstantR2FromArray2D({{1.0f, 2.0f}, {3.0f, 4.0f}}); auto matrix2 = builder.ConstantR2FromArray2D({{5.0f, 6.0f}, {7.0f, 8.0f}}); auto matrix12 = builder.Dot(matrix1, matrix2); @@ -559,7 +558,7 @@ TYPED_TEST_CASE(DotOperationTestForBatchMatMul, TypesF16F32F64); // sync-dependent on bitcasts' operands. XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 2, 2, 2}), "x"); auto y = @@ -569,7 +568,7 @@ XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) { auto y_flat = builder.Reshape(y, {0, 1, 2, 3}, {4, 2, 2}); // Slice batches into individual matrices and multiply them. - std::vector out_slices; + std::vector out_slices; for (int i = 0; i < 4; ++i) { // Slice off individual matrices and reshape to 2D tensors. 
auto x_slice = builder.Slice(x_flat, {i, 0, 0}, {i + 1, 2, 2}, {1, 1, 1}); @@ -615,7 +614,7 @@ XLA_TYPED_TEST(DotOperationTestForBatchMatMul, Types) { XLA_TYPED_TEST(DotOperationTest_F16F32F64, GeneralMatMul) { using T = TypeParam; - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 2, 2}), "x"); auto y = @@ -677,7 +676,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, TransposeFolding) { MinorToMajorForIsRowMajor(row_major)))) .ConsumeValueOrDie(); - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto prim_type = primitive_util::NativeToPrimitiveType(); auto lhs_arg = builder.Parameter( 0, ShapeUtil::MakeShape(prim_type, {lhs->height(), lhs->width()}), @@ -713,7 +712,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, new Array2D({{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, {6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f}})); - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); auto rhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShape(prim_type, {2, 2}), "rhs_arg_0"); @@ -761,7 +760,7 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, {4.0f, 3.0f}, {2.0f, 1.0f}})); - ComputationBuilder builder(this->client_, this->TestName()); + XlaBuilder builder(this->TestName()); auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); auto lhs_arg_0 = builder.Parameter(0, ShapeUtil::MakeShapeWithType({2, 2}), "lhs_arg_0"); @@ -799,5 +798,250 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64, this->error_spec_); } +XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstRHSClassicMM) { + std::unique_ptr> constant_lhs_array(new Array2D( + {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + std::unique_ptr> constant_rhs_array( + new Array2D({{1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0}, + {9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0}, + {3.0, 2.0, 1.0}})); + // Dot result to slice from: {{114, 105, 96}, {96, 105, 114}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({1, 0}); + auto dynamic_slice = + builder.DynamicSlice(lhs_constant, start_constant, {1, 6}); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + + Array2D expected({{96.0, 105.0, 114.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} + +XLA_TEST_F(DotOperationTest, DotOfGatherOptimizationWithConstLHSClassicMM) { + std::unique_ptr> constant_lhs_array(new Array2D( + {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + std::unique_ptr> constant_rhs_array( + new Array2D({{1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0}, + {9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0}, + {3.0, 2.0, 1.0}})); + // Dot result to slice from: {{114, 105, 96}, {96, 105, 114}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({0, 1}); + auto dynamic_slice = + builder.DynamicSlice(rhs_constant, start_constant, {6, 1}); + + DotDimensionNumbers dot_dnums; 
+ dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(0); + auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + + Array2D expected({{105.0}, {105.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} + +// TODO (b/69062148) Enable when Dot implements general contracting dimensions. +XLA_TEST_F(DotOperationTest, + DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER( + DotOfGatherOptimizationWithConstRHSReverseMM)))) { + std::unique_ptr> constant_lhs_array( + new Array2D({{1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0}, + {9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0}, + {3.0, 2.0, 1.0}})); + std::unique_ptr> constant_rhs_array(new Array2D( + {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + // Dot result to slice from: {{114, 96}, {105, 105}, {96, 114}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({0, 1}); + auto dynamic_slice = + builder.DynamicSlice(lhs_constant, start_constant, {6, 1}); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(0); + dot_dnums.add_rhs_contracting_dimensions(1); + auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + + Array2D expected({{105.0, 105.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} + +// TODO (b/69062148) Enable when Dot implements general contracting dimensions. +XLA_TEST_F(DotOperationTest, + DISABLED_ON_CPU(DISABLED_ON_GPU(DISABLED_ON_INTERPRETER( + DotOfGatherOptimizationWithConstLHSReverseMM)))) { + std::unique_ptr> constant_lhs_array( + new Array2D({{1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0}, + {9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0}, + {3.0, 2.0, 1.0}})); + std::unique_ptr> constant_rhs_array(new Array2D( + {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + // Dot result to slice from: {{114, 96}, {105, 105}, {96, 114}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({1, 0}); + auto dynamic_slice = + builder.DynamicSlice(rhs_constant, start_constant, {1, 6}); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(0); + dot_dnums.add_rhs_contracting_dimensions(1); + auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + + Array2D expected({{96.0}, {105.0}, {114.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} + +// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
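As a standalone cross-check of the classic-matmul cases above (plain C++, no XLA dependency; the numbers come straight from the constants in `DotOfGatherOptimizationWithConstRHSClassicMM`): the `{1, 6}` dynamic slice starting at `{1, 0}` selects row 1 of the LHS, and contracting it against the 6x3 RHS reproduces the expected `{96, 105, 114}`.

```c++
#include <cassert>
#include <cstdio>

// Verifies the expected literal of DotOfGatherOptimizationWithConstRHSClassicMM:
// dot(DynamicSlice(lhs, {1, 0}, {1, 6}), rhs) == {{96, 105, 114}}.
int main() {
  const double lhs[2][6] = {{1, 2, 3, 4, 5, 6}, {6, 5, 4, 3, 2, 1}};
  const double rhs[6][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9},
                            {9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
  const int slice_row = 1;  // start {1, 0}, slice size {1, 6}
  double out[3] = {0, 0, 0};
  for (int j = 0; j < 3; ++j) {
    for (int k = 0; k < 6; ++k) {
      out[j] += lhs[slice_row][k] * rhs[k][j];  // contract lhs dim 1 with rhs dim 0
    }
  }
  std::printf("{%g, %g, %g}\n", out[0], out[1], out[2]);
  assert(out[0] == 96 && out[1] == 105 && out[2] == 114);
  return 0;
}
```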
+XLA_TEST_F(DotOperationTest, + DISABLED_ON_CPU(DISABLED_ON_GPU( + DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSRows)))) { + std::unique_ptr> constant_lhs_array( + new Array2D({{1.0, 2.0}, + {3.0, 4.0}, + {5.0, 6.0}, + {6.0, 5.0}, + {4.0, 3.0}, + {2.0, 1.0}})); + std::unique_ptr> constant_rhs_array( + new Array2D({{1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0}, + {9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0}, + {3.0, 2.0, 1.0}})); + // Dot result to slice from: {{132, 129, 126}, {126, 129, 132}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({0, 1}); + auto dynamic_slice = + builder.DynamicSlice(lhs_constant, start_constant, {6, 1}); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(0); + dot_dnums.add_rhs_contracting_dimensions(0); + auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + + Array2D expected({{126.0, 129.0, 132.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} + +// TODO (b/69062148) Enable when Dot implements general contracting dimensions. +XLA_TEST_F(DotOperationTest, + DISABLED_ON_CPU(DISABLED_ON_GPU( + DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSRows)))) { + std::unique_ptr> constant_lhs_array( + new Array2D({{1.0, 2.0}, + {3.0, 4.0}, + {5.0, 6.0}, + {6.0, 5.0}, + {4.0, 3.0}, + {2.0, 1.0}})); + std::unique_ptr> constant_rhs_array( + new Array2D({{1.0, 2.0, 3.0}, + {4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0}, + {9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0}, + {3.0, 2.0, 1.0}})); + // Dot result to slice from: {{132, 129, 126}, {126, 129, 132}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({0, 1}); + auto dynamic_slice = + builder.DynamicSlice(rhs_constant, start_constant, {6, 1}); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(0); + dot_dnums.add_rhs_contracting_dimensions(0); + auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + + Array2D expected({{129.0}, {129.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} + +// TODO (b/69062148) Enable when Dot implements general contracting dimensions. 
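The same kind of plain-C++ cross-check for the general-contracting-dimension case `DotOfGatherOptimizationWithConstRHSRows` above: the `{6, 1}` slice starting at `{0, 1}` extracts column 1 of the 6x2 LHS, and contracting dimension 0 of both operands against the 6x3 RHS reproduces the expected `{126, 129, 132}`.

```c++
#include <cassert>

// Verifies the expected literal of DotOfGatherOptimizationWithConstRHSRows:
// DotGeneral with lhs/rhs contracting dimension 0 over column 1 of the LHS.
int main() {
  const double lhs[6][2] = {{1, 2}, {3, 4}, {5, 6}, {6, 5}, {4, 3}, {2, 1}};
  const double rhs[6][3] = {{1, 2, 3}, {4, 5, 6}, {7, 8, 9},
                            {9, 8, 7}, {6, 5, 4}, {3, 2, 1}};
  const int slice_col = 1;  // start {0, 1}, slice size {6, 1}
  double out[3] = {0, 0, 0};
  for (int j = 0; j < 3; ++j) {
    for (int k = 0; k < 6; ++k) {
      out[j] += lhs[k][slice_col] * rhs[k][j];  // contract dim 0 with dim 0
    }
  }
  assert(out[0] == 126 && out[1] == 129 && out[2] == 132);
  return 0;
}
```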
+XLA_TEST_F(DotOperationTest, + DISABLED_ON_CPU(DISABLED_ON_GPU( + DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstRHSCols)))) { + std::unique_ptr> constant_lhs_array(new Array2D( + {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + std::unique_ptr> constant_rhs_array( + new Array2D({{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0, 9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + // Dot result to slice from: {{91, 168, 56}, {56, 168, 91}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({1, 0}); + auto dynamic_slice = + builder.DynamicSlice(lhs_constant, start_constant, {1, 6}); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(1); + auto result = builder.DotGeneral(dynamic_slice, rhs_constant, dot_dnums); + + Array2D expected({{56.0, 168.0, 91.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} + +// TODO (b/69062148) Enable when Dot implements general contracting dimensions. +XLA_TEST_F(DotOperationTest, + DISABLED_ON_CPU(DISABLED_ON_GPU( + DISABLED_ON_INTERPRETER(DotOfGatherOptimizationWithConstLHSCols)))) { + std::unique_ptr> constant_lhs_array(new Array2D( + {{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + std::unique_ptr> constant_rhs_array( + new Array2D({{1.0, 2.0, 3.0, 4.0, 5.0, 6.0}, + {7.0, 8.0, 9.0, 9.0, 8.0, 7.0}, + {6.0, 5.0, 4.0, 3.0, 2.0, 1.0}})); + // Dot result to slice from: {{91, 168, 56}, {56, 168, 91}} + + XlaBuilder builder(TestName()); + auto lhs_constant = builder.ConstantR2FromArray2D(*constant_lhs_array); + auto rhs_constant = builder.ConstantR2FromArray2D(*constant_rhs_array); + auto start_constant = builder.ConstantR1({1, 0}); + auto dynamic_slice = + builder.DynamicSlice(rhs_constant, start_constant, {1, 6}); + + DotDimensionNumbers dot_dnums; + dot_dnums.add_lhs_contracting_dimensions(1); + dot_dnums.add_rhs_contracting_dimensions(1); + auto result = builder.DotGeneral(lhs_constant, dynamic_slice, dot_dnums); + + Array2D expected({{168.0}, {168.0}}); + ComputeAndCompareR2(&builder, expected, {}, error_spec_); +} } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc index ff53a84588fc04..49f3a10d227f2f 100644 --- a/tensorflow/compiler/xla/tests/dynamic_ops_test.cc +++ b/tensorflow/compiler/xla/tests/dynamic_ops_test.cc @@ -53,9 +53,9 @@ class DynamicSliceTest : public ClientLibraryTestBase { } template - void TestR1Wrap() { - // Slice at dimension boundaries, but with sizes that cause indices to wrap. - RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {6, 7, 0, 1}); + void TestR1OOB() { + // Slice at dimension boundaries, but with out of bounds indices. + RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {6}, {4}, {4, 5, 6, 7}); } template @@ -78,10 +78,10 @@ class DynamicSliceTest : public ClientLibraryTestBase { } template - void TestR2Wrap() { - // Slice at dimension boundaries, but with sizes that cause indices to wrap. + void TestR2OOB() { + // Slice at dimension boundaries, but with out of bounds indices. 
RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {1, 1}, {3, 3}, - {{5, 6, 4}, {8, 9, 7}, {2, 3, 1}}); + {{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); } template @@ -106,11 +106,11 @@ class DynamicSliceTest : public ClientLibraryTestBase { } template - void TestR3Wrap() { - // Slice at dimension boundaries, but with sizes that cause indices to wrap. + void TestR3OOB() { + // Slice at dimension boundaries, but with out of bounds indices. RunR3( {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {0, 2, 1}, - {2, 1, 2}, {{{6, 5}}, {{12, 11}}}); + {2, 1, 2}, {{{5, 6}}, {{11, 12}}}); } template @@ -199,19 +199,19 @@ class DynamicSliceTest : public ClientLibraryTestBase { XLA_TEST_F(DynamicSliceTest, Int32R1BF16) { TestR1(); } XLA_TEST_F(DynamicSliceTest, Int32R1) { TestR1(); } -XLA_TEST_F(DynamicSliceTest, Int32R1Wrap) { TestR1Wrap(); } +XLA_TEST_F(DynamicSliceTest, Int32R1OOB) { TestR1OOB(); } XLA_TEST_F(DynamicSliceTest, Int64R1) { TestR1(); } XLA_TEST_F(DynamicSliceTest, UInt64R1) { TestR1(); } XLA_TEST_F(DynamicSliceTest, Int32R2BF16) { TestR2(); } XLA_TEST_F(DynamicSliceTest, Int32R2) { TestR2(); } -XLA_TEST_F(DynamicSliceTest, Int32R2Wrap) { TestR2Wrap(); } +XLA_TEST_F(DynamicSliceTest, Int32R2OOB) { TestR2OOB(); } XLA_TEST_F(DynamicSliceTest, Int64R2) { TestR2(); } XLA_TEST_F(DynamicSliceTest, UInt64R2) { TestR2(); } XLA_TEST_F(DynamicSliceTest, Int32R3BF16) { TestR3(); } XLA_TEST_F(DynamicSliceTest, Int32R3) { TestR3(); } -XLA_TEST_F(DynamicSliceTest, Int32R3Wrap) { TestR3Wrap(); } +XLA_TEST_F(DynamicSliceTest, Int32R3OOB) { TestR3OOB(); } XLA_TEST_F(DynamicSliceTest, Int64R3) { TestR3(); } XLA_TEST_F(DynamicSliceTest, UInt64R3) { TestR3(); } @@ -332,17 +332,17 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { } template - void TestWrap() { - // Slice at dimension boundaries, but with sizes that cause indices to wrap. + void TestOOB() { + // // Slice at dimension boundaries, but with out of bounds indices. RunR1({0, 1, 2, 3, 4, 5, 6, 7}, {8, 9, 10}, {6}, - {10, 1, 2, 3, 4, 5, 8, 9}); + {0, 1, 2, 3, 4, 8, 9, 10}); // R2 Shape: [3, 3] RunR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}, {{10, 11}}, {2, 2}, - {{1, 2, 3}, {4, 5, 6}, {11, 8, 10}}); + {{1, 2, 3}, {4, 5, 6}, {7, 10, 11}}); // R3 Shape: [2, 3, 2] RunR3( {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 10}, {11, 12}}}, {{{13}, {15}}}, - {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 15}, {9, 10}, {11, 13}}}); + {1, 2, 1}, {{{1, 2}, {3, 4}, {5, 6}}, {{7, 8}, {9, 13}, {11, 15}}}); } template @@ -361,9 +361,9 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { ->Convert(primitive_util::NativeToPrimitiveType()) .ValueOrDie()); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // Initialize and transfer dynamic slice start indices parameter. - ComputationDataHandle starts; + XlaOp starts; std::unique_ptr start_data = CreateR1Parameter( slice_starts, 0, "slice_starts", &builder, &starts); // Build dynamic slice computation. @@ -476,20 +476,19 @@ class DynamicUpdateSliceTest : public ClientLibraryTestBase { Array3D input_values(kSeq, kBatch, kDim); Array3D update_values(size, kBatch, kDim); Array3D expected_values(kSeq, kBatch, kDim); + index = std::min(std::max(0, index), kSeq - size); input_values.FillIota(static_cast(0)); T value = static_cast(10); update_values.FillIota(static_cast(value)); // TODO(b/34128753) Expected values may vary depending on backend when - // the update wraps. 
According to documentation, the results are technically - implementation specific where the update is out of bounds, and hence - we don't really know what to pass into ComputeAndCompareR3. + // the indices are out of bounds. expected_values.FillIota(static_cast(0)); for (int i = 0; i < size; i++) { for (int j = 0; j < kBatch; j++) { for (int k = 0; k < kDim; k++) { - expected_values((index + i) % kSeq, j, k) = value++; + expected_values(index + i, j, k) = value++; } } } @@ -547,12 +546,10 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int32R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, Int64R3) { TestR3(); } XLA_TEST_F(DynamicUpdateSliceTest, UInt64R3) { TestR3(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int32WrapBF16) { - TestWrap(); -} -XLA_TEST_F(DynamicUpdateSliceTest, Int32Wrap) { TestWrap(); } -XLA_TEST_F(DynamicUpdateSliceTest, Int64Wrap) { TestWrap(); } -XLA_TEST_F(DynamicUpdateSliceTest, UInt64Wrap) { TestWrap(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32OOBBF16) { TestOOB(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int32OOB) { TestOOB(); } +XLA_TEST_F(DynamicUpdateSliceTest, Int64OOB) { TestOOB(); } +XLA_TEST_F(DynamicUpdateSliceTest, UInt64OOB) { TestOOB(); } XLA_TEST_F(DynamicUpdateSliceTest, Int32R1Pred) { // Slice at dimension start. @@ -615,37 +612,37 @@ XLA_TEST_F(DynamicUpdateSliceTest, Int32R3Pred) { // Tests for simple R3 case where the update is contiguous (i.e. the minor // two dimensions are not sliced). XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElement) { - // Single element, no wrap. + // Single element, index in-bounds. std::vector operand_shape({4, 5, 2}); RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/1); } XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousSingleElementBF16) { - // Single element, no wrap. + // Single element, index in-bounds. std::vector operand_shape({4, 5, 2}); RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/1); } XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElements) { - // Multiple element, no wrap. + // Multiple elements, index in-bounds. std::vector operand_shape({4, 5, 2}); RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/2); } XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleElementsBF16) { - // Multiple element, no wrap. + // Multiple elements, index in-bounds. std::vector operand_shape({4, 5, 2}); RunR3Contiguous(operand_shape, /*index=*/1, /*size=*/2); } -XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrapping) { - // Multiple element, wrapping. +XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleOOB) { + // Multiple elements, index out of bounds. std::vector operand_shape({4, 5, 2}); RunR3Contiguous(operand_shape, /*index=*/3, /*size=*/2); } -XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleWrappingBF16) { - // Multiple element, wrapping. +XLA_TEST_F(DynamicUpdateSliceTest, R3ContiguousMultipleOOBBF16) { + // Multiple elements, index out of bounds. std::vector operand_shape({4, 5, 2}); RunR3Contiguous(operand_shape, /*index=*/3, /*size=*/2); } diff --git a/tensorflow/compiler/xla/tests/execution_profile_test.cc b/tensorflow/compiler/xla/tests/execution_profile_test.cc index c8cc8e40aa3210..a6ba6db5d3bf86 100644 --- a/tensorflow/compiler/xla/tests/execution_profile_test.cc +++ b/tensorflow/compiler/xla/tests/execution_profile_test.cc @@ -13,8 +13,9 @@ See the License for the specific language governing permissions and limitations under the License.
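The `Wrap`-to-`OOB` renames and the updated expected literals above encode a semantics change: instead of wrapping, an out-of-bounds start index is clamped so the whole slice or update stays inside the operand, which is exactly what the `std::min(std::max(0, index), kSeq - size)` line added to `RunR3Contiguous` does. A self-contained plain-C++ sketch of that clamping rule, assuming clamping is the intended behavior:

```c++
#include <algorithm>
#include <cassert>
#include <vector>

// 1-D DynamicSlice with a clamped start index: reproduces the TestR1OOB
// expectation {0..7} sliced with start 6 and size 4 -> {4, 5, 6, 7}.
std::vector<int> DynamicSlice1D(const std::vector<int>& operand, int start,
                                int size) {
  start = std::min(std::max(0, start),
                   static_cast<int>(operand.size()) - size);  // keep slice in bounds
  return std::vector<int>(operand.begin() + start,
                          operand.begin() + start + size);
}

int main() {
  std::vector<int> result =
      DynamicSlice1D({0, 1, 2, 3, 4, 5, 6, 7}, /*start=*/6, /*size=*/4);
  assert((result == std::vector<int>{4, 5, 6, 7}));
  return 0;
}
```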
==============================================================================*/ -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" #include "tensorflow/core/platform/test.h" @@ -32,9 +33,9 @@ XLA_TEST_F(ExecutionProfileTest, ExecuteWithExecutionProfile) { client_->TransferToServer( *Literal::CreateR2F32Linspace(1e0, 1e5, 256, 256))); - ComputationBuilder b(client_, TestName() + ".add"); + XlaBuilder b(TestName() + ".add"); b.Dot(b.Parameter(0, shape, "param_0"), b.Parameter(1, shape, "param_1")); - TF_ASSERT_OK_AND_ASSIGN(Computation dot_product, b.Build()); + TF_ASSERT_OK_AND_ASSIGN(XlaComputation dot_product, b.Build()); ExecutionProfile execution_profile; TF_ASSERT_OK_AND_ASSIGN( diff --git a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc index b28fe0c15a89a1..0a37e4d4236201 100644 --- a/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc +++ b/tensorflow/compiler/xla/tests/exhaustive_f32_elementwise_op_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -35,7 +36,7 @@ class ExhaustiveF32ElementwiseOpTest int64 input_size = end - begin; LOG(INFO) << "Checking range [" << begin << ", " << end << ")"; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr input_literal = Literal::CreateFromDimensions(F32, {input_size}); @@ -78,9 +79,7 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, LogF32) { #endif ExhaustivelyTestF32Op( - [](ComputationBuilder* builder, const ComputationDataHandle& input) { - builder->Log(input); - }, + [](XlaBuilder* builder, const XlaOp& input) { builder->Log(input); }, std::log, known_incorrect_range); } @@ -96,17 +95,13 @@ XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, ExpF32) { #endif ExhaustivelyTestF32Op( - [](ComputationBuilder* builder, const ComputationDataHandle& input) { - builder->Exp(input); - }, + [](XlaBuilder* builder, const XlaOp& input) { builder->Exp(input); }, std::exp, known_incorrect_range); } XLA_TEST_P(ExhaustiveF32ElementwiseOpTest, TanhF32) { ExhaustivelyTestF32Op( - [](ComputationBuilder* builder, const ComputationDataHandle& input) { - builder->Tanh(input); - }, + [](XlaBuilder* builder, const XlaOp& input) { builder->Tanh(input); }, std::tanh, /*known_incorrect_range=*/{0, 0}); } diff --git a/tensorflow/compiler/xla/tests/filecheck.cc b/tensorflow/compiler/xla/tests/filecheck.cc index a5f6872c46c780..93d1c921c4a138 100644 --- a/tensorflow/compiler/xla/tests/filecheck.cc +++ b/tensorflow/compiler/xla/tests/filecheck.cc @@ -38,7 +38,7 @@ StatusOr RunFileCheck(const string& input, const string& pattern) { TF_RETURN_IF_ERROR(tensorflow::WriteStringToFile(env, pattern_path, pattern)); // Invoke FileCheck to check whether input matches `pattern`. 
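The `exhaustive_f32_elementwise_op_test` hunks above only swap the builder types in the op callbacks; the tests themselves sweep ranges of f32 inputs and compare `Log`, `Exp`, and `Tanh` against the `std::` reference functions. A toy standalone sweep over sampled f32 bit patterns, purely to illustrate the enumeration idea (it is not the test's actual harness, and the stride is an assumption to keep it fast):

```c++
#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

int main() {
  // Walk the 32-bit pattern space with a coarse stride (a full sweep is 2^32).
  for (uint64_t bits = 0; bits < (1ull << 32); bits += (1u << 16)) {
    uint32_t b = static_cast<uint32_t>(bits);
    float x;
    std::memcpy(&x, &b, sizeof x);  // reinterpret the bit pattern as a float
    float y = std::tanh(x);
    // Reference property checked here: tanh maps every float into [-1, 1]
    // (or NaN for NaN inputs); the real test compares against the backend.
    assert(std::isnan(y) || (y >= -1.0f && y <= 1.0f));
  }
  return 0;
}
```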
- const char* file_check_path_suffix = "external/llvm/FileCheck"; + const char* file_check_path_suffix = "org_tensorflow/external/llvm/FileCheck"; string file_check_path; if (const char* test_srcdir = getenv("TEST_SRCDIR")) { file_check_path = JoinPath(test_srcdir, file_check_path_suffix); @@ -66,6 +66,11 @@ StatusOr RunFileCheck(const string& input, const string& pattern) { // the error message generated by FileCheck and the inputs. bool succeeded = (exit_status == 0); if (!succeeded) { + LOG(WARNING) << "Tried to execute FileCheck at " << file_check_path; + if (!env->FileExists(file_check_path).ok()) { + LOG(WARNING) << "NOTE: FileCheck binary does not exist!"; + } + LOG(WARNING) << "FileCheck error: " << standard_error; LOG(WARNING) << "FileCheck input was:"; XLA_LOG_LINES(tensorflow::WARNING, input); diff --git a/tensorflow/compiler/xla/tests/floor_ceil_test.cc b/tensorflow/compiler/xla/tests/floor_ceil_test.cc index e75a41acacc3aa..71eb914a8e5eae 100644 --- a/tensorflow/compiler/xla/tests/floor_ceil_test.cc +++ b/tensorflow/compiler/xla/tests/floor_ceil_test.cc @@ -16,8 +16,8 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -41,7 +41,7 @@ class FloorCeilTest : public ClientLibraryTestBase { tensorflow::gtl::ArraySlice expected, Function f) { LOG(INFO) << "input: {" << tensorflow::str_util::Join(expected, ", ") << "}"; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto c = builder.ConstantR1(input); if (f == kCeil) { builder.Ceil(c); @@ -54,7 +54,7 @@ class FloorCeilTest : public ClientLibraryTestBase { void TestR0F32(float input, float expected, Function f) { LOG(INFO) << "input: " << expected; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto c = builder.ConstantR0(input); if (f == kCeil) { builder.Ceil(c); diff --git a/tensorflow/compiler/xla/tests/fmax_test.cc b/tensorflow/compiler/xla/tests/fmax_test.cc index f2aaf6621c1f0d..73f029b59bc56a 100644 --- a/tensorflow/compiler/xla/tests/fmax_test.cc +++ b/tensorflow/compiler/xla/tests/fmax_test.cc @@ -15,8 +15,8 @@ limitations under the License. #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/core/platform/test.h" @@ -27,7 +27,7 @@ namespace { class FmaxSimpleTest : public ClientLibraryTestBase {}; TEST_F(FmaxSimpleTest, FmaxTenValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1( {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0}); auto y = builder.ConstantR1( diff --git a/tensorflow/compiler/xla/tests/fusion_test.cc b/tensorflow/compiler/xla/tests/fusion_test.cc index 6f89e9164c8d44..e6f79b5ac55ddd 100644 --- a/tensorflow/compiler/xla/tests/fusion_test.cc +++ b/tensorflow/compiler/xla/tests/fusion_test.cc @@ -25,8 +25,7 @@ limitations under the License. 
#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/ptr_util.h" @@ -119,9 +118,9 @@ class FusionTest : public HloTestBase { auto expected = Literal::CreateR2FromArray2D(answer_data); auto actual = ExecuteAndTransfer(std::move(hlo_module), {}); if (primitive_util::IsFloatingPointType(prim_type)) { - LiteralTestUtil::ExpectNear(*expected, *actual, ErrorSpec(1e-4)); + EXPECT_TRUE(LiteralTestUtil::Near(*expected, *actual, ErrorSpec(1e-4))); } else { - LiteralTestUtil::ExpectEqual(*expected, *actual); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *actual)); } } @@ -222,9 +221,9 @@ XLA_TEST_F(FusionTest, Test) { const4, reshape3, add2, const1, const0}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectNear(*Literal::CreateR2({{0.5}, {2.72}}), - *ExecuteAndTransfer(std::move(hlo_module), {}), - ErrorSpec(1e-4)); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR2({{0.5}, {2.72}}), + *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } // Test whether we emit appropriate code for parameters of fusion instructions. @@ -248,9 +247,9 @@ XLA_TEST_F(FusionTest, Parameter) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{add3, const2}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectNear(*Literal::CreateR2({{-1.0, 0.0, 1.0}}), - *ExecuteAndTransfer(std::move(hlo_module), {}), - ErrorSpec(1e-4)); + EXPECT_TRUE(LiteralTestUtil::Near( + *Literal::CreateR2({{-1.0, 0.0, 1.0}}), + *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } XLA_TEST_F(FusionTest, RandomizedParallelPartition) { @@ -308,9 +307,9 @@ XLA_TEST_F(FusionTest, BroadcastIntoBinaryOp) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{add2, broadcast}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectNear( + EXPECT_TRUE(LiteralTestUtil::Near( *Literal::CreateR2({{0.0, 0.0, -1.0}, {11.0, 22.0, 33.0}}), - *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4)); + *ExecuteAndTransfer(std::move(hlo_module), {}), ErrorSpec(1e-4))); } XLA_TEST_F(FusionTest, ReshapeToScalar) { @@ -323,8 +322,9 @@ XLA_TEST_F(FusionTest, ReshapeToScalar) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR0(5), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR0(5), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { @@ -337,9 +337,9 @@ XLA_TEST_F(FusionTest, Reshape_3by2_1by2by3) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR3({{{1, 2, 3}, {4, 5, 6}}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { @@ -352,9 +352,9 @@ XLA_TEST_F(FusionTest, Reshape_1by2by3_3by2) { 
hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR2({{1, 2}, {3, 4}, {5, 6}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Reshape_1by1by1_) { @@ -367,8 +367,9 @@ XLA_TEST_F(FusionTest, Reshape_1by1by1_) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR0(7), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR0(7), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Reshape__1by1by1) { @@ -381,8 +382,9 @@ XLA_TEST_F(FusionTest, Reshape__1by1by1) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR3({{{7}}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR3({{{7}}}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Reshape__) { @@ -395,8 +397,9 @@ XLA_TEST_F(FusionTest, Reshape__) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR0(7), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR0(7), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { @@ -409,9 +412,9 @@ XLA_TEST_F(FusionTest, Reshape_3by3_3by3) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Transpose_2by3) { @@ -424,9 +427,9 @@ XLA_TEST_F(FusionTest, Transpose_2by3) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR2({{1, 4}, {2, 5}, {3, 6}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Transpose_3by3) { @@ -439,9 +442,9 @@ XLA_TEST_F(FusionTest, Transpose_3by3) { hlo_module->AddEntryComputation(builder.Build()) ->CreateFusionInstruction(/*instructions_to_fuse=*/{reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR2({{1, 4, 7}, {2, 5, 8}, {3, 6, 9}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Reverse) { @@ -455,8 +458,9 @@ XLA_TEST_F(FusionTest, Reverse) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{reverse1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({3, 2, 1}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + 
LiteralTestUtil::Equal(*Literal::CreateR1({3, 2, 1}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, ReverseNegate) { @@ -472,8 +476,9 @@ XLA_TEST_F(FusionTest, ReverseNegate) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, reverse1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({-3, -2, -1}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR1({-3, -2, -1}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, BroadcastNegate) { @@ -489,8 +494,9 @@ XLA_TEST_F(FusionTest, BroadcastNegate) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, broadcast1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({-1, -1}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR1({-1, -1}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, SliceNegate) { @@ -506,8 +512,9 @@ XLA_TEST_F(FusionTest, SliceNegate) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, slice1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({-1, -3}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR1({-1, -3}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, DynamicSliceNegate) { @@ -527,8 +534,9 @@ XLA_TEST_F(FusionTest, DynamicSliceNegate) { /*instructions_to_fuse=*/{negate3, dynamic_slice2}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({-2, -3}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR1({-2, -3}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, ReshapeNegate) { @@ -544,8 +552,9 @@ XLA_TEST_F(FusionTest, ReshapeNegate) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, reshape1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR2({{-1, -2}, {-3, -4}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{-1, -2}, {-3, -4}}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } // TODO(b/64070202): Investigate failure. 
@@ -562,8 +571,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_GPU(TransposeNegate)) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate2, transpose1}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR2({{-1, -3}, {-2, -4}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR2({{-1, -3}, {-2, -4}}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } std::unique_ptr MakeReduceTestComputation() { @@ -592,8 +602,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(Reduce)) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce2}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR0(15), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR0(15), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) { @@ -613,8 +624,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceImplicitBroadcast)) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{negate3, reduce2}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual(*Literal::CreateR0(-15), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR0(-15), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { @@ -662,9 +674,9 @@ XLA_TEST_F(FusionTest, DISABLED_ON_CPU(ReduceWindow)) { ->CreateFusionInstruction(/*instructions_to_fuse=*/{reduce_window2}, HloInstruction::FusionKind::kLoop); - LiteralTestUtil::ExpectEqual( + EXPECT_TRUE(LiteralTestUtil::Equal( *Literal::CreateR2({{462, 2145}, {24871, 62491}}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + *ExecuteAndTransfer(std::move(hlo_module), {}))); } // When a constant (or other op) which has multiple users is imported @@ -675,21 +687,20 @@ XLA_TEST_F(FusionTest, SharedConstant) { auto builder = HloComputation::Builder(TestName()); auto const0 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({0}))); + HloInstruction::CreateConstant(Literal::CreateR1({0}))); auto const1 = builder.AddInstruction( - HloInstruction::CreateConstant(Literal::CreateR1({2}))); + HloInstruction::CreateConstant(Literal::CreateR1({2}))); auto add1 = builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, const0)); + ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, const0)); auto add2 = builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add1)); + ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add1)); auto add3 = builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add2)); + ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add2)); auto add4 = builder.AddInstruction(HloInstruction::CreateBinary( - ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add3)); + ShapeUtil::MakeShape(S32, {1}), HloOpcode::kAdd, const1, add3)); hlo_module->AddEntryComputation(builder.Build()) - ->CreateFusionInstruction( - {add4, add3, add2, add1, const1}, - HloInstruction::FusionKind::kLoop); + ->CreateFusionInstruction({add4, add3, add2, add1, const1}, + HloInstruction::FusionKind::kLoop); HloComputation* entry_comp = hlo_module->entry_computation(); @@ -699,8 +710,9 @@ XLA_TEST_F(FusionTest, SharedConstant) { // fused instruction contains the 
constant(2), the parameter, and 4 adds EXPECT_EQ(entry_comp->root_instruction()->fused_instruction_count(), 6); - LiteralTestUtil::ExpectEqual(*Literal::CreateR1({8}), - *ExecuteAndTransfer(std::move(hlo_module), {})); + EXPECT_TRUE( + LiteralTestUtil::Equal(*Literal::CreateR1({8}), + *ExecuteAndTransfer(std::move(hlo_module), {}))); } XLA_TEST_F(FusionTest, Add2D) { TestElementwise2D(HloOpcode::kAdd); } @@ -779,7 +791,7 @@ void BM_ParallelFusion(int num_iters) { const int64 param2_dim1 = 1024; // Create computation. - ComputationBuilder builder(client, "ParallelFusion"); + XlaBuilder builder("ParallelFusion"); Shape shape0 = ShapeUtil::MakeShape(F32, {param0_dim0, param0_dim1}); auto param0 = builder.Parameter(0, shape0, "param0"); Shape shape1 = ShapeUtil::MakeShape(F32, {param1_dim0, param1_dim1}); diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc index 4dd3acd9af1621..143ffbdeb409d9 100644 --- a/tensorflow/compiler/xla/tests/gather_operation_test.cc +++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc @@ -14,12 +14,12 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/xla/execution_options_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" // NB! TODO(b/74360564): These tests do not test out of bounds behavior since // that hasn't been specced yet. 
@@ -41,7 +41,7 @@ class GatherOperationTest : public HloTestBase { HloModuleConfig config; config.set_debug_options(GetDebugOptionsForTest()); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, - tools::Parse(hlo_text, config)); + ParseHloString(hlo_text, config)); EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt)); } }; @@ -399,6 +399,184 @@ ENTRY main { RunTest(hlo_text, operand.get(), gather_indices.get()); } +XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherV2) { + const string hlo_text = R"( +HloModule FusedTensorFlowGatherV2 + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + gather = s32[3,2] gather(operand, indices), + output_window_dims={0}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=1, + window_bounds={3, 1} + one = s32[] constant(1) + one_broadcasted = s32[3,2] broadcast(one), dimensions={} + ROOT result = s32[3,2]{1,0} add(gather, one_broadcasted) +} +)"; + std::unique_ptr operand = + Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr gather_indices = Literal::CreateR1({0, 2}); + RunTest(hlo_text, operand.get(), gather_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherMultipleBatchDims) { + const string hlo_text = R"( +HloModule FusedTensorFlowGatherMultipleBatchDims + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,3,2] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={1}, + gather_dims_to_operand_dims={1}, + index_vector_dim=2, + window_bounds={3, 1} + one = s32[] constant(1) + one_broadcasted = s32[2,3,2] broadcast(one), dimensions={} + ROOT result = s32[2,3,2]{2,1,0} add(gather, one_broadcasted) +} +)"; + std::unique_ptr operand = + Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr gather_indices = + Literal::CreateR2({{0, 2}, {2, 1}}); + RunTest(hlo_text, operand.get(), gather_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNdMultipleBatchDims) { + const string hlo_text = R"( +HloModule FusedTensorFlowGatherNdMultipleBatchDims + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2,2] parameter(1) + gather = s32[2,2] gather(operand, indices), + output_window_dims={}, + elided_window_dims={0,1}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=2, + window_bounds={1, 1} + one = s32[] constant(1) + one_broadcasted = s32[2,2] broadcast(one), dimensions={} + ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) +} +)"; + std::unique_ptr operand = + Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr gather_indices = + Literal::CreateR3({{{0, 2}, {2, 1}}, {{1, 2}, {2, 0}}}); + RunTest(hlo_text, operand.get(), gather_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, FusedTensorFlowGatherNd) { + const string hlo_text = R"( +HloModule FusedTensorFlowGatherNd + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,2] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0,1}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=1, + window_bounds={1,1,2} + one = s32[] constant(1) + one_broadcasted = s32[2,2] broadcast(one), dimensions={} + ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) +} +)"; + std::unique_ptr operand = + Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr gather_indices = + Literal::CreateR2({{0, 0}, {1, 0}}); 
+ RunTest(hlo_text, operand.get(), gather_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, + FusedTensorFlowGatherNdNonDefaultIndexVectorDim) { + const string hlo_text = R"( +HloModule FusedTensorFlowGatherNd + +ENTRY main { + operand = s32[3,3,2] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,2] gather(operand, indices), + output_window_dims={1}, + elided_window_dims={0,1}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=0, + window_bounds={1,1,2} + one = s32[] constant(1) + one_broadcasted = s32[2,2] broadcast(one), dimensions={} + ROOT result = s32[2,2]{1,0} add(gather, one_broadcasted) +} +)"; + std::unique_ptr operand = + Literal::CreateR3({{{-1, 1}, {-2, 2}, {-3, 3}}, // + {{-4, 4}, {-5, 5}, {-6, 6}}, // + {{-7, 7}, {-8, 8}, {-9, 9}}}); + std::unique_ptr gather_indices = + Literal::CreateR2({{0, 0}, {1, 0}}); + RunTest(hlo_text, operand.get(), gather_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, FusedDynamicSlice) { + const char* hlo_text = R"( +HloModule FusedDynamicSlice + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2] parameter(1) + gather = s32[1,1] gather(operand, indices), + output_window_dims={0,1}, + elided_window_dims={}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=0, + window_bounds={1,1} + one = s32[] constant(1) + one_broadcasted = s32[1,1] broadcast(one), dimensions={} + ROOT result = s32[1,1]{1,0} add(gather, one_broadcasted) +} +)"; + std::unique_ptr operand = + Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr gather_indices = Literal::CreateR1({1, 1}); + RunTest(hlo_text, operand.get(), gather_indices.get()); +} + +XLA_TEST_F(GatherOperationTest, FusedBatchDynamicSlice) { + const string hlo_text = R"( +HloModule FusedBatchDynamicSlice + +ENTRY main { + operand = s32[3,3] parameter(0) + indices = s32[2,2] parameter(1) + gather = s32[2,1,1] gather(operand, indices), + output_window_dims={1,2}, + elided_window_dims={}, + gather_dims_to_operand_dims={0,1}, + index_vector_dim=0, + window_bounds={1,1} + one = s32[] constant(1) + one_broadcasted = s32[2,1,1] broadcast(one), dimensions={} + ROOT result = s32[2,1,1]{2,1,0} add(gather, one_broadcasted) +} +)"; + std::unique_ptr operand = + Literal::CreateR2({{1, 2, 3}, {4, 5, 6}, {7, 8, 9}}); + std::unique_ptr gather_indices = + Literal::CreateR2({{2, 1}, {1, 1}}); + RunTest(hlo_text, operand.get(), gather_indices.get()); +} + class GatherClientLibraryTest : public ClientLibraryTestBase {}; XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) { @@ -451,8 +629,8 @@ XLA_TEST_F(GatherClientLibraryTest, DISABLED_ON_GPU(Basic)) { client_->ExecuteParallel(computation_instances)); TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr result_literal, client_->Transfer(*(result_data[0]))); - LiteralTestUtil::ExpectEqual( - *result_literal, *Literal::CreateR2({{1, 2, 3}, {7, 8, 9}})); + EXPECT_TRUE(LiteralTestUtil::Equal( + *result_literal, *Literal::CreateR2({{1, 2, 3}, {7, 8, 9}}))); } } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/half_test.cc b/tensorflow/compiler/xla/tests/half_test.cc index ec2f49d43bd8ce..76bf47845ca045 100644 --- a/tensorflow/compiler/xla/tests/half_test.cc +++ b/tensorflow/compiler/xla/tests/half_test.cc @@ -16,8 +16,7 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/test.h" @@ -39,7 +38,7 @@ class HalfTestBase : public ClientLibraryTestBase { }; using UnaryBuildFuncTy = - std::function; + std::function; struct UnaryOpTestParam { std::function compute_func; @@ -51,8 +50,8 @@ class UnaryOpTest : public HalfTestBase, XLA_TEST_P(UnaryOpTest, Ops) { std::vector x({half(1.4), half(-2.3), half(3.2), half(-4.1)}); - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle x_opnd; + XlaBuilder builder(TestName()); + XlaOp x_opnd; auto x_data = CreateR1Parameter(x, /*parameter_number=*/0, "x", &builder, &x_opnd); @@ -79,30 +78,21 @@ half round_imp(half value) { INSTANTIATE_TEST_CASE_P( half, UnaryOpTest, - ::testing::Values(UnaryOpTestParam{[](half x) { return abs(x); }, - &ComputationBuilder::Abs}, - UnaryOpTestParam{[](half x) { return round_imp(x); }, - &ComputationBuilder::Round}, - UnaryOpTestParam{[](half x) { return ceil(x); }, - &ComputationBuilder::Ceil}, - UnaryOpTestParam{[](half x) { return cos(x); }, - &ComputationBuilder::Cos}, - UnaryOpTestParam{[](half x) { return exp(x); }, - &ComputationBuilder::Exp}, - UnaryOpTestParam{[](half x) { return floor(x); }, - &ComputationBuilder::Floor}, - UnaryOpTestParam{[](half x) { return log(x); }, - &ComputationBuilder::Log}, - UnaryOpTestParam{[](half x) { return -x; }, - &ComputationBuilder::Neg}, - UnaryOpTestParam{[](half x) { return sign_imp(x); }, - &ComputationBuilder::Sign}, - UnaryOpTestParam{[](half x) { return sin(x); }, - &ComputationBuilder::Sin}, - UnaryOpTestParam{[](half x) { return tanh(x); }, - &ComputationBuilder::Tanh} + ::testing::Values( + UnaryOpTestParam{[](half x) { return abs(x); }, &XlaBuilder::Abs}, + UnaryOpTestParam{[](half x) { return round_imp(x); }, + &XlaBuilder::Round}, + UnaryOpTestParam{[](half x) { return ceil(x); }, &XlaBuilder::Ceil}, + UnaryOpTestParam{[](half x) { return cos(x); }, &XlaBuilder::Cos}, + UnaryOpTestParam{[](half x) { return exp(x); }, &XlaBuilder::Exp}, + UnaryOpTestParam{[](half x) { return floor(x); }, &XlaBuilder::Floor}, + UnaryOpTestParam{[](half x) { return log(x); }, &XlaBuilder::Log}, + UnaryOpTestParam{[](half x) { return -x; }, &XlaBuilder::Neg}, + UnaryOpTestParam{[](half x) { return sign_imp(x); }, &XlaBuilder::Sign}, + UnaryOpTestParam{[](half x) { return sin(x); }, &XlaBuilder::Sin}, + UnaryOpTestParam{[](half x) { return tanh(x); }, &XlaBuilder::Tanh} - )); + )); struct UnaryPredTestParam { std::function compute_func; @@ -115,8 +105,8 @@ class UnaryPredTest : public HalfTestBase, XLA_TEST_P(UnaryPredTest, Ops) { std::vector x({half(1.4), half(-2.3), half(3.2), half(-4.1)}); - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle x_opnd; + XlaBuilder builder(TestName()); + XlaOp x_opnd; auto x_data = CreateR1Parameter(x, /*parameter_number=*/0, "x", &builder, &x_opnd); @@ -136,11 +126,11 @@ XLA_TEST_P(UnaryPredTest, Ops) { INSTANTIATE_TEST_CASE_P(half, UnaryPredTest, ::testing::Values(UnaryPredTestParam{ [](half x) { return isfinite(x); }, - &ComputationBuilder::IsFinite})); + &XlaBuilder::IsFinite})); using BinaryBuildFuncTy = std::function)>; + xla::XlaBuilder*, const xla::XlaOp& x, const xla::XlaOp& y, + tensorflow::gtl::ArraySlice)>; struct BinaryOpTestParam { 
std::function compute_func; @@ -153,12 +143,12 @@ class BinaryOpTest : public HalfTestBase, XLA_TEST_P(BinaryOpTest, Ops) { std::vector x({half(1.0), half(2.0), half(3.0), half(-4.0)}); std::vector y({half(0.4), half(-0.3), half(0.2), half(0.1)}); - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle x_opnd; + XlaBuilder builder(TestName()); + XlaOp x_opnd; auto x_data = CreateR1Parameter(x, /*parameter_number=*/0, "x", &builder, &x_opnd); - ComputationDataHandle y_opnd; + XlaOp y_opnd; auto y_data = CreateR1Parameter(y, /*parameter_number=*/1, "y", &builder, &y_opnd); @@ -184,21 +174,21 @@ INSTANTIATE_TEST_CASE_P( half, BinaryOpTest, ::testing::Values( BinaryOpTestParam{[](half x, half y) { return x + y; }, - &ComputationBuilder::Add}, + &XlaBuilder::Add}, BinaryOpTestParam{[](half x, half y) { return atan2_imp(x, y); }, - &ComputationBuilder::Atan2}, + &XlaBuilder::Atan2}, BinaryOpTestParam{[](half x, half y) { return x / y; }, - &ComputationBuilder::Div}, + &XlaBuilder::Div}, BinaryOpTestParam{[](half x, half y) { return max(x, y); }, - &ComputationBuilder::Max}, + &XlaBuilder::Max}, BinaryOpTestParam{[](half x, half y) { return min(x, y); }, - &ComputationBuilder::Min}, + &XlaBuilder::Min}, BinaryOpTestParam{[](half x, half y) { return x * y; }, - &ComputationBuilder::Mul}, + &XlaBuilder::Mul}, BinaryOpTestParam{[](half x, half y) { return pow(x, y); }, - &ComputationBuilder::Pow}, + &XlaBuilder::Pow}, BinaryOpTestParam{[](half x, half y) { return x - y; }, - &ComputationBuilder::Sub} + &XlaBuilder::Sub} )); @@ -214,12 +204,12 @@ class BinaryPredTest XLA_TEST_P(BinaryPredTest, Ops) { std::vector x({half(1.0), half(2.0), half(0.2), half(-4.0)}); std::vector y({half(0.4), half(-0.3), half(0.2), half(0.1)}); - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle x_opnd; + XlaBuilder builder(TestName()); + XlaOp x_opnd; auto x_data = CreateR1Parameter(x, /*parameter_number=*/0, "x", &builder, &x_opnd); - ComputationDataHandle y_opnd; + XlaOp y_opnd; auto y_data = CreateR1Parameter(y, /*parameter_number=*/1, "y", &builder, &y_opnd); @@ -239,17 +229,17 @@ XLA_TEST_P(BinaryPredTest, Ops) { INSTANTIATE_TEST_CASE_P( half, BinaryPredTest, ::testing::Values(BinaryPredTestParam{[](half x, half y) { return x == y; }, - &ComputationBuilder::Eq}, + &XlaBuilder::Eq}, BinaryPredTestParam{[](half x, half y) { return x != y; }, - &ComputationBuilder::Ne}, + &XlaBuilder::Ne}, BinaryPredTestParam{[](half x, half y) { return x >= y; }, - &ComputationBuilder::Ge}, + &XlaBuilder::Ge}, BinaryPredTestParam{[](half x, half y) { return x > y; }, - &ComputationBuilder::Gt}, + &XlaBuilder::Gt}, BinaryPredTestParam{[](half x, half y) { return x <= y; }, - &ComputationBuilder::Le}, + &XlaBuilder::Le}, BinaryPredTestParam{[](half x, half y) { return x < y; }, - &ComputationBuilder::Lt} + &XlaBuilder::Lt} )); diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc index 9984aba089be89..08ed826c80823e 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc @@ -23,11 +23,11 @@ limitations under the License. 
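// Illustrative sketch only: the hlo_test_base changes below (a) move the HLO
// text parser include from tools/parser/ to service/hlo_parser.h, (b) let
// CreateNewModule() take an explicit module name, and (c) add
// GetModuleConfigForTest(). A hedged usage sketch built only from the
// signatures shown in this diff; the module name is a placeholder.
//
//   std::unique_ptr<HloModule> module = CreateNewModule();         // name defaults to TestName()
//   std::unique_ptr<HloModule> named = CreateNewModule("my_run");  // explicit name
//   HloModuleConfig config = GetModuleConfigForTest();             // debug options already applied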
#include "tensorflow/compiler/xla/layout_util.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" #include "tensorflow/compiler/xla/ptr_util.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_utils.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -93,17 +93,16 @@ HloTestBase::HloTestBase(se::Platform* test_platform, } /* static */ -std::unique_ptr HloTestBase::CreateNewModule() { - HloModuleConfig config; - config.set_debug_options(GetDebugOptionsForTest()); - return MakeUnique(TestName(), VersionedComputationHandle(), - config); +std::unique_ptr HloTestBase::CreateNewModule(const string& name) { + return MakeUnique(name, VersionedComputationHandle(), + GetModuleConfigForTest()); } /*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() { auto debug_options = legacy_flags::GetDebugOptionsFromFlags(); // TODO(b/38354253): Change tests to use Parameters instead of Constants. debug_options.add_xla_disable_hlo_passes("constant_folding"); + debug_options.set_xla_gpu_max_kernel_unroll_factor(1); return debug_options; } diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h index 79fcea9403e6e2..eb3a2ea76a667a 100644 --- a/tensorflow/compiler/xla/tests/hlo_test_base.h +++ b/tensorflow/compiler/xla/tests/hlo_test_base.h @@ -85,13 +85,21 @@ class HloTestBase : public ::testing::Test { // options from command-line flags. If you want a fresh HloModule object and // then add HloComputations to it, it's recommended to use this method in your // tests. - static std::unique_ptr CreateNewModule(); + static std::unique_ptr CreateNewModule( + const string& name = TestName()); // Populates debug options from command-line flags and adjusts the options for // testing. It is recommended to use this when you need to pass in // DebugOptions, e.g. when creating a module from a string or a file. static DebugOptions GetDebugOptionsForTest(); + // Gets an HloModuleConfig with options appropriate for tests. + static HloModuleConfig GetModuleConfigForTest() { + HloModuleConfig config; + config.set_debug_options(GetDebugOptionsForTest()); + return config; + } + // Executes the given module and return the result as a Literal. StatusOr> Execute( std::unique_ptr module, @@ -176,9 +184,13 @@ class HloTestBase : public ::testing::Test { // 'layout'. void ForceParameterLayout(HloModule* module, int64 param_no, const Layout& layout) { - ASSERT_LT(param_no, - module->mutable_entry_computation_layout()->parameter_count()); - module->mutable_entry_computation_layout() + ASSERT_LT( + param_no, + module->mutable_host_entry_computation_layout()->parameter_count()); + module->mutable_host_entry_computation_layout() + ->mutable_parameter_layout(param_no) + ->ResetLayout(layout); + module->mutable_device_entry_computation_layout() ->mutable_parameter_layout(param_no) ->ResetLayout(layout); } @@ -186,7 +198,10 @@ class HloTestBase : public ::testing::Test { // Convenience method to force the layout of the computation result in a // module. The result layout of 'module' is set to 'layout'. 
void ForceResultLayout(HloModule* module, const Layout& layout) { - module->mutable_entry_computation_layout() + module->mutable_host_entry_computation_layout() + ->mutable_result_layout() + ->ResetLayout(layout); + module->mutable_device_entry_computation_layout() ->mutable_result_layout() ->ResetLayout(layout); } @@ -194,7 +209,10 @@ class HloTestBase : public ::testing::Test { // Convenience method to clear the layout of the computation result in // 'module'. void ForceClearResultLayout(HloModule* module) { - module->mutable_entry_computation_layout() + module->mutable_host_entry_computation_layout() + ->mutable_result_layout() + ->Clear(); + module->mutable_device_entry_computation_layout() ->mutable_result_layout() ->Clear(); } diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc index da4cf4ae0c31bc..c8a05c2e9e971d 100644 --- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc +++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc @@ -15,10 +15,10 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h" +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/service/hlo_verifier.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/test.h" @@ -67,7 +67,7 @@ HloModule& HloVerifiedTestBase::module() { void HloVerifiedTestBase::ParseAndVerifyModule( tensorflow::StringPiece hlo_text) { CHECK(!module_) << "Called ParseModule when test already has a module."; - TF_ASSERT_OK_AND_ASSIGN(module_, tools::Parse(hlo_text)); + TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text)); VerifyModule(); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/literal_test_util.cc b/tensorflow/compiler/xla/tests/literal_test_util.cc index c28f79ae386670..cde1dcd9cd10c8 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util.cc @@ -15,978 +15,93 @@ limitations under the License. 
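// Illustrative sketch only: the literal_test_util.cc hunk below removes the
// hand-rolled shape and element comparison machinery and re-implements
// Equal/Near/NearOrEqual as thin wrappers over the literal_comparison
// library, translating its Status into ::testing::AssertionResult. A small
// sketch of how a test consumes the wrapped API (the literal values are
// placeholders, not part of the patch):
//
//   auto expected = Literal::CreateR1<float>({1.0f, 2.0f});
//   auto actual = Literal::CreateR1<float>({1.0f, 2.0001f});
//   EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *expected));
//   EXPECT_TRUE(LiteralTestUtil::Near(*expected, *actual, ErrorSpec(0.001)));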
#include "tensorflow/compiler/xla/tests/literal_test_util.h" -#include -#include -#include - -#include "tensorflow/compiler/xla/index_util.h" -#include "tensorflow/compiler/xla/layout_util.h" -#include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/ptr_util.h" -#include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/test.h" -#include "tensorflow/compiler/xla/types.h" -#include "tensorflow/core/lib/core/casts.h" +#include "tensorflow/compiler/xla/literal_comparison.h" #include "tensorflow/core/lib/io/path.h" -#include "tensorflow/core/lib/strings/str_util.h" -#include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/lib/strings/stringprintf.h" -#include "tensorflow/core/platform/env.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/platform/protobuf.h" #include "tensorflow/core/platform/test.h" -#include "tensorflow/core/platform/types.h" namespace xla { -using ::tensorflow::strings::Appendf; -using ::tensorflow::strings::Printf; -using ::tensorflow::strings::StrAppend; -using ::tensorflow::strings::StrCat; - -/* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes( - const Shape& expected, const Shape& actual) { - if (ShapeUtil::IsTuple(expected) != ShapeUtil::IsTuple(actual)) { - return ::testing::AssertionFailure() - << "tupleness-mismatch! want: " << ShapeUtil::HumanString(expected) - << " got: " << ShapeUtil::HumanString(actual); - } - if (ShapeUtil::IsTuple(expected)) { - if (ShapeUtil::TupleElementCount(expected) != - ShapeUtil::TupleElementCount(actual)) { - return ::testing::AssertionFailure() - << "want tuple element count: " - << ShapeUtil::TupleElementCount(expected) - << " got tuple element count: " - << ShapeUtil::TupleElementCount(actual); - } - for (int i = 0; i < expected.tuple_shapes_size(); ++i) { - ::testing::AssertionResult result = - EqualShapes(expected.tuple_shapes(i), actual.tuple_shapes(i)) - << "mismatch in tuple index " << i; - if (!result) { - return result; - } - } - } else { - if (ShapeUtil::Rank(expected) != ShapeUtil::Rank(actual)) { - return ::testing::AssertionFailure() - << "want rank of: " << ShapeUtil::HumanString(expected) - << " got rank of: " << ShapeUtil::HumanString(actual); - } - if (expected.element_type() != actual.element_type()) { - return ::testing::AssertionFailure() - << PrimitiveType_Name(expected.element_type()) << " vs " - << PrimitiveType_Name(actual.element_type()); - } - if (expected.dimensions_size() != actual.dimensions_size()) { - return ::testing::AssertionFailure() - << "want dimensions_size " << expected.dimensions_size() - << " got dimensions_size " << actual.dimensions_size(); - } - for (int i = 0; i < expected.dimensions_size(); ++i) { - if (expected.dimensions(i) != actual.dimensions(i)) { - return ::testing::AssertionFailure() - << "mismatch in dimension #" << i - << " expected: " << ShapeUtil::HumanString(expected) - << " actual: " << ShapeUtil::HumanString(actual); - } - } - } - return ::testing::AssertionSuccess(); -} - -/* static */ void LiteralTestUtil::AssertEqualShapes(const Shape& expected, - const Shape& actual) { - ASSERT_TRUE(EqualShapes(expected, actual)); -} - -/* static */ void LiteralTestUtil::AssertEqualShapesAndLayouts( - const Shape& expected, const Shape& actual) { - ASSERT_EQ(expected.ShortDebugString(), actual.ShortDebugString()); -} - -namespace { - -// Return a literal with all arrays of type FromNativeT converted to type -// ToNativeT in the given literal. 
-template -std::unique_ptr ConvertType(const Literal& literal) { - // First construct shape of the result. - Shape result_shape(literal.shape()); - ShapeUtil::ForEachMutableSubshape( - &result_shape, [](Shape* subshape, const ShapeIndex&) { - if (subshape->element_type() == - primitive_util::NativeToPrimitiveType()) { - subshape->set_element_type( - primitive_util::NativeToPrimitiveType()); - } - }); - auto result = MakeUnique(result_shape); - - // Then copy over the data from 'literal' converting FromNativeT values to - // ToNativeT values as necessary. - ShapeUtil::ForEachSubshape( - literal.shape(), - [&](const Shape& subshape, const ShapeIndex& shape_index) { - if (ShapeUtil::IsArray(subshape)) { - if (subshape.element_type() == - primitive_util::NativeToPrimitiveType()) { - auto src = literal.data(shape_index); - auto dest = result->data(shape_index); - for (int64 i = 0; i < src.size(); ++i) { - dest[i] = static_cast(src[i]); - } - } else { - TF_CHECK_OK(result->CopyFrom(literal, - /*dest_shape_index=*/shape_index, - /*src_shape_index=*/shape_index)); - } - } - }); - return result; -} - -} // namespace - -/* static */ std::unique_ptr LiteralTestUtil::ConvertBF16ToF32( - const Literal& literal) { - return ConvertType(literal); -} - -/* static */ std::unique_ptr LiteralTestUtil::ConvertF32ToBF16( - const Literal& literal) { - return ConvertType(literal); -} - namespace { -string Hostname() { - char hostname[1024]; - gethostname(hostname, sizeof hostname); - hostname[sizeof hostname - 1] = 0; - return string(hostname); -} - -// Helper function for comparing a floating point type, FloatT, bitwise equal -// between the left-hand-side and right-hand-side, by bit-casting to UnsignedT -// -- on miscompare, a nice error message is given in the AssertionFailure. -template -::testing::AssertionResult CompareFloatsBitwiseEqual(FloatT lhs, FloatT rhs) { - auto ulhs = tensorflow::bit_cast(lhs); - auto urhs = tensorflow::bit_cast(rhs); - auto lhs_double = static_cast(lhs); - auto rhs_double = static_cast(rhs); - if (ulhs != urhs) { - return ::testing::AssertionFailure() << Printf( - "floating values are not bitwise-equal; and equality testing " - "was requested: %s=%g=%a vs %s=%g=%a", - StrCat(tensorflow::strings::Hex(ulhs)).c_str(), lhs_double, - lhs_double, StrCat(tensorflow::strings::Hex(urhs)).c_str(), - rhs_double, rhs_double); - } - return ::testing::AssertionSuccess(); -} - -// Templated comparator that specializes for float equality comparison with the -// bitwise helper above (this is the un-specialized fallback, to just use the -// default gunit implementation). -template -::testing::AssertionResult CompareEqual(NativeT lhs, NativeT rhs) { - if (lhs == rhs) { +// Writes the given literal to a file in the test temporary directory. +void WriteLiteralToTempFile(const LiteralSlice& literal, const string& name) { + auto get_hostname = [] { + char hostname[1024]; + gethostname(hostname, sizeof hostname); + hostname[sizeof hostname - 1] = 0; + return string(hostname); + }; + int64 now_usec = tensorflow::Env::Default()->NowMicros(); + string filename = tensorflow::io::JoinPath( + tensorflow::testing::TmpDir(), + tensorflow::strings::Printf("tempfile-%s-%llx-%s", get_hostname().c_str(), + now_usec, name.c_str())); + TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(), filename, + literal.ToProto())); + LOG(ERROR) << "wrote to " << name << " file: " << filename; +} + +// Callback helper that dumps literals to temporary files in the event of a +// miscomparison. 
+void OnMiscompare(const LiteralSlice& expected, const LiteralSlice& actual, + const LiteralSlice& mismatches) { + LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected.shape()) << " " + << literal_comparison::ToStringTruncated(expected); + LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual.shape()) << " " + << literal_comparison::ToStringTruncated(actual); + LOG(INFO) << "Dumping literals to temp files..."; + WriteLiteralToTempFile(expected, "expected"); + WriteLiteralToTempFile(actual, "actual"); + WriteLiteralToTempFile(mismatches, "mismatches"); +} + +::testing::AssertionResult StatusToAssertion(const Status& s) { + if (s.ok()) { return ::testing::AssertionSuccess(); } - ::testing::Message msg; - msg << "Expected equality of these values:"; - msg << "\n " << lhs; - msg << "\n " << rhs; - - return ::testing::AssertionFailure() << msg; -} - -// Specializations for floating types that do bitwise comparisons when equality -// comparison is requested. -template <> -::testing::AssertionResult CompareEqual(bfloat16 lhs, bfloat16 rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); -} -template <> -::testing::AssertionResult CompareEqual(Eigen::half lhs, - Eigen::half rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); -} -template <> -::testing::AssertionResult CompareEqual(float lhs, float rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); -} -template <> -::testing::AssertionResult CompareEqual(double lhs, double rhs) { - return CompareFloatsBitwiseEqual(lhs, rhs); -} -template <> -::testing::AssertionResult CompareEqual(complex64 lhs, - complex64 rhs) { - auto res = CompareEqual(lhs.real(), rhs.real()); - if (!res) { - return res; - } - return CompareEqual(lhs.imag(), rhs.imag()); -} - -// A recursive function which iterates through every index of expected and -// actual literal and compares their values elementwise. Returns true if all -// elements are equal. -template -bool ExpectLiteralsEqual(const Literal& expected, const Literal& actual, - tensorflow::gtl::MutableArraySlice multi_index, - int64 dimension) { - if (dimension == expected.shape().dimensions_size()) { - NativeT expected_value = expected.Get(multi_index); - NativeT actual_value = actual.Get(multi_index); - ::testing::AssertionResult result = - CompareEqual(expected_value, actual_value); - return result; // Defines implicit coersion to bool. - } - - bool all_match = true; - for (int64 i = 0; i < expected.shape().dimensions(dimension); ++i) { - multi_index[dimension] = i; - all_match = all_match && ExpectLiteralsEqual( - expected, actual, multi_index, dimension + 1); - } - return all_match; + return ::testing::AssertionFailure() << s.error_message(); } } // namespace -/* static */ void LiteralTestUtil::ExpectEqual(const Literal& expected, - const Literal& actual, - const string& message) { - EXPECT_TRUE(Equal(expected, actual)) - << "expected:\n" - << expected.ToString() << "\n\tvs actual:\n" - << actual.ToString() - << (message.empty() ? 
"" : StrCat("\nmessage: ", message)); -} - -/* static */ void LiteralTestUtil::ExpectNotEqual(const Literal& expected, - const Literal& actual) { - EXPECT_FALSE(Equal(expected, actual)); -} - -/* static */ ::testing::AssertionResult LiteralTestUtil::Equal( - const Literal& expected, const Literal& actual) { - VLOG(1) << "expected:"; - XLA_VLOG_LINES(1, expected.ToString()); - VLOG(1) << "actual:"; - XLA_VLOG_LINES(1, actual.ToString()); - - AssertEqualShapes(expected.shape(), actual.shape()); - std::vector multi_index(expected.shape().dimensions_size(), 0); - bool match = false; - switch (expected.shape().element_type()) { - case PRED: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case U8: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case S32: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case S64: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case U32: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case U64: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case BF16: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case F16: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case F32: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case F64: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case C64: - match = ExpectLiteralsEqual(expected, actual, &multi_index, 0); - break; - case TUPLE: { - bool tuple_match = true; - for (int i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { - SCOPED_TRACE(StrCat("Tuple index ", i, " in ", - ShapeUtil::HumanString(expected.shape()))); - - // Create LiteralViews of the expected and actual elements. - auto result = Equal(LiteralView::Create(expected, {i}), - LiteralView::Create(actual, {i})); - tuple_match = tuple_match ? !!result : false; - } - match = tuple_match; - break; - } - default: - LOG(FATAL) - << "Unsupported primitive type in LiteralTestUtil::ExpectEqual: " - << PrimitiveType_Name(expected.shape().element_type()); - } - ::testing::AssertionResult result = ::testing::AssertionSuccess(); - if (!match) { - result = ::testing::AssertionFailure() - << "expected: " << expected.ToString() - << "\nactual: " << actual.ToString(); - VLOG(1) << result.message(); - } - return result; -} - -namespace { - -// Gets the total element count. For tuples, this is not the count of tuple -// elements, but the sum of elements of each tuple element. -int64 RecursiveElementCount(const Shape& shape) { - if (ShapeUtil::IsTuple(shape)) { - const int64 tuple_elements = ShapeUtil::TupleElementCount(shape); - int64 total = 0; - for (int64 i = 0; i < tuple_elements; ++i) { - total += RecursiveElementCount(ShapeUtil::GetTupleElementShape(shape, i)); - } - return total; - } else { - return ShapeUtil::ElementsIn(shape); - } -} - -// Calling ToString on a literal with over 100 million elements takes around -// 3 minutes. The utility of printing a literal with >1000 elements is -// questionable, especially when writing the Literal proto to disk is orders -// of magnitude faster. -string TruncateHugeLiteral(const Literal& literal) { - return RecursiveElementCount(literal.shape()) < 1000 - ? 
literal.ToString() - : "[TRUNCATED, Literal with more than 1000 values]"; +/* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapes( + const Shape& expected, const Shape& actual) { + return StatusToAssertion(literal_comparison::EqualShapes(expected, actual)); } -// Returns whether the actual and expected values are mismatched with respect to -// nans. 'relaxed_nans' is interpreted as in xla::ErrorSpec. -template -bool NanMismatch(NativeT expected, NativeT actual, bool relaxed_nans) { - if (relaxed_nans) { - return !std::isnan(expected) && std::isnan(actual); - } else { - return std::isnan(expected) != std::isnan(actual); +/* static */ ::testing::AssertionResult LiteralTestUtil::EqualShapesAndLayouts( + const Shape& expected, const Shape& actual) { + if (expected.ShortDebugString() != actual.ShortDebugString()) { + return ::testing::AssertionFailure() + << "want: " << expected.ShortDebugString() + << " got: " << actual.ShortDebugString(); } + return ::testing::AssertionSuccess(); } -template <> -bool NanMismatch(complex64 expected, complex64 actual, - bool relaxed_nans) { - return NanMismatch(expected.real(), actual.real(), relaxed_nans) || - NanMismatch(expected.imag(), actual.imag(), relaxed_nans); -} - -template <> -bool NanMismatch(half expected, half actual, bool relaxed_nans) { - return NanMismatch(static_cast(expected), - static_cast(actual), relaxed_nans); -} - -// Converts the given floating-point value to a string. -template -string FpValueToString(NativeT value) { - return Printf("%8.4g", static_cast(value)); -} - -template <> -string FpValueToString(complex64 value) { - return Printf("%8.4g + %8.4fi", value.real(), value.imag()); -} - -// Returns the absolute value of the given floating point value. This function -// is used instead of std::abs directly in order to allow type-dependent -// implementations for NearComparator. -template -float FpAbsoluteValue(NativeT value) { - return std::abs(value); -} - -template <> -float FpAbsoluteValue(bfloat16 value) { - return FpAbsoluteValue(static_cast(value)); -} - -template <> -float FpAbsoluteValue(half value) { - return FpAbsoluteValue(static_cast(value)); -} - -// Helper class for comparing floating-point literals within an error bound. -template -class NearComparator { - public: - // Compares the two array literals elementwise and returns an assertion - // result. The assertion result is successful if all actual and expected - // elements are within the given error bound. In case of error, the assertion - // result contains a detailed error message in case of failure. - static ::testing::AssertionResult Compare(const Literal& expected, - const Literal& actual, - ErrorSpec error, - bool detailed_message) { - NearComparator comparator(expected, actual, error, - detailed_message); - return comparator.Run(); - } - - private: - // Data structure encapsulating metadata about a single element mismatch. - struct Mismatch { - NativeT actual; - NativeT expected; - float rel_error; - float abs_error; - - // The linear index of the failure within the shape. This linear index is - // from the 'actual' literal. 
- int64 linear_index; - - bool operator<(const Mismatch& other) const { - return rel_error < other.rel_error; - } - - string ToString(const Shape& shape) const { - return Printf( - "actual %s, expected %s, index %s, rel error %8.3g, abs error %8.3g", - FpValueToString(actual).c_str(), FpValueToString(expected).c_str(), - LiteralTestUtil::MultiIndexAsString( - IndexUtil::LinearIndexToMultidimensionalIndex(shape, - linear_index)) - .c_str(), - rel_error, abs_error); - } - }; - - explicit NearComparator(const Literal& expected, const Literal& actual, - ErrorSpec error, bool detailed_message) - : expected_(expected), - actual_(actual), - error_(error), - detailed_message_(detailed_message), - abs_value_buckets_(kAbsValueBucketBounds.size() - 1, {0, 0}), - abs_error_buckets_(kErrorBucketBounds.size(), 0), - rel_error_buckets_(kErrorBucketBounds.size(), 0) {} - - // Runs the comparison between expected and actual literals. - ::testing::AssertionResult Run() { - VLOG(1) << "expected:"; - XLA_VLOG_LINES(1, TruncateHugeLiteral(expected_)); - VLOG(1) << "actual:"; - XLA_VLOG_LINES(1, TruncateHugeLiteral(actual_)); - - // If the shapes mismatch, we simply fail the expectation instead of - // printing out data, as it's a type error rather than a value error. - ::testing::AssertionResult equal_shapes = - LiteralTestUtil::EqualShapes(expected_.shape(), actual_.shape()); - if (!equal_shapes) { - return equal_shapes; - } - if (!ShapeUtil::IsArray(expected_.shape())) { - return ::testing::AssertionFailure() << "Expected array shape"; - } - - mismatches_ = Literal(ShapeUtil::ChangeElementType(actual_.shape(), PRED)); - mismatches_.PopulateWithValue(false); - - CompareLiterals(); - - if (num_mismatches_ == 0) { - return ::testing::AssertionSuccess(); - } else if (!VLOG_IS_ON(1)) { - LOG(INFO) << "expected: " << ShapeUtil::HumanString(expected_.shape()) - << " " << TruncateHugeLiteral(expected_); - LOG(INFO) << "actual: " << ShapeUtil::HumanString(actual_.shape()) - << " " << TruncateHugeLiteral(actual_); - LOG(INFO) << "Dumping literals to temp files..."; - WriteLiteralToTempFile(expected_, "expected"); - WriteLiteralToTempFile(actual_, "actual"); - WriteLiteralToTempFile(mismatches_, "mismatches"); - } - return ::testing::AssertionFailure() << ErrorMessage(); - } - - // Insert the given absolute value into the absolute value bucket vector. The - // bounds of the buckets are given by kAbsValueBucketBounds. - void UpdateAbsValueBucket(NativeT value, bool is_mismatch) { - // Adjust the bucket containing the absolute values of the 'actual' - // elements. - const float abs_value = FpAbsoluteValue(value); - for (int i = 0; i < abs_value_buckets_.size(); ++i) { - if (i == abs_value_buckets_.size() - 1 || - (abs_value >= kAbsValueBucketBounds[i] && - abs_value < kAbsValueBucketBounds[i + 1])) { - // The first value of the pair is the count of elements in the bucket, - // the second is the count of mismatches in the bucket. - abs_value_buckets_[i].first++; - if (is_mismatch) { - abs_value_buckets_[i].second++; - } - return; - } - } - } - - // Insert the given error into the given error bucket vector. 
- void UpdateErrorBucket( - float error, tensorflow::gtl::MutableArraySlice error_buckets) { - CHECK_EQ(error_buckets.size(), kErrorBucketBounds.size()); - for (int i = 0; i < error_buckets.size(); ++i) { - if (error >= kErrorBucketBounds[i]) { - error_buckets[i]++; - } - } - } - - // Compares the two given elements from the expected and actual literals at - // the given literal_index and keeps track of various mismatch statistics. - void CompareValues(NativeT expected, NativeT actual, int64 linear_index) { - const bool is_nan_mismatch = - NanMismatch(expected, actual, error_.relaxed_nans); - float abs_error; - float rel_error; - if (actual == expected) { - abs_error = 0; - rel_error = 0; - } else if (is_nan_mismatch) { - num_nan_mismatches_++; - // A nan mismatch is considered to have infinite error. rel_error is used - // for sorting a std::set of the top mismatchs, and a nan value here will - // result in undefined behavior because nan's do not satisfy the strict - // weak ordering requirement of std containers. - abs_error = std::numeric_limits::infinity(); - rel_error = std::numeric_limits::infinity(); - } else { - abs_error = FpAbsoluteValue(actual - expected); - rel_error = abs_error / FpAbsoluteValue(expected); - } - const bool is_abs_mismatch = abs_error > error_.abs; - const bool is_rel_mismatch = rel_error > error_.rel; - const bool is_mismatch = - is_nan_mismatch || (is_abs_mismatch && is_rel_mismatch); - - // Update the error of the relative bucket only if the *absolute* error - // bound is exceeded and vice versa. - if (is_abs_mismatch) { - num_abs_mismatches_++; - UpdateErrorBucket(rel_error, &rel_error_buckets_); - } - if (is_rel_mismatch) { - num_rel_mismatches_++; - UpdateErrorBucket(abs_error, &abs_error_buckets_); - } - - UpdateAbsValueBucket(actual, is_mismatch); - - if (!is_mismatch) { - return; - } - - num_mismatches_++; - - // Keep track of the kTopRelativeErrorCount relative error mismatches. - if (top_rel_mismatches_.size() < kTopRelativeErrorCount || - rel_error > top_rel_mismatches_.begin()->rel_error) { - Mismatch mismatch = {actual, expected, rel_error, abs_error, - linear_index}; - top_rel_mismatches_.insert(mismatch); - if (top_rel_mismatches_.size() > kTopRelativeErrorCount) { - top_rel_mismatches_.erase(top_rel_mismatches_.begin()); - } - } - - mismatches_.data()[linear_index] = true; - } - - // Compares the two literals elementwise. - void CompareLiterals() { - // Fast path optimization for the case were layouts match. - if (LayoutUtil::Equal(actual_.shape().layout(), - expected_.shape().layout())) { - tensorflow::gtl::ArraySlice expected_data = - expected_.data(); - tensorflow::gtl::ArraySlice actual_data = - actual_.data(); - const int64 len = expected_data.size(); - for (int64 i = 0; i < len; ++i) { - CompareValues(expected_data[i], actual_data[i], i); - } - return; - } - std::vector multi_index(ShapeUtil::Rank(actual_.shape()), 0); - CompareLiteralsSlow(0, &multi_index); - } - - // Slow path for CompareLiterals when 'actual' and 'expected' literals have - // different layouts. In this case, multidimensional indices are constructed - // and indexed for each element. 
- void CompareLiteralsSlow(int64 dimension, std::vector* multi_index) { - if (dimension == multi_index->size()) { - CompareValues(expected_.Get(*multi_index), - actual_.Get(*multi_index), - IndexUtil::MultidimensionalIndexToLinearIndex( - actual_.shape(), *multi_index)); - } else { - for (int64 i = 0; i < expected_.shape().dimensions(dimension); ++i) { - (*multi_index)[dimension] = i; - CompareLiteralsSlow(dimension + 1, multi_index); - } - } - } - - // Writes the given literal to a file in the test temporary directory. - void WriteLiteralToTempFile(const Literal& literal, const string& name) { - int64 now_usec = tensorflow::Env::Default()->NowMicros(); - string filename = tensorflow::io::JoinPath( - tensorflow::testing::TmpDir(), - Printf("tempfile-%s-%llx-%s", Hostname().c_str(), now_usec, - name.c_str())); - TF_CHECK_OK(tensorflow::WriteBinaryProto(tensorflow::Env::Default(), - filename, literal.ToProto())); - LOG(ERROR) << "wrote to " << name << " file: " << filename; - } - - // Returns an error message string with a detailed breakdown of the - // mismatches. Called after calling Run(). - string ErrorMessage() { - string out; - int64 element_count = ShapeUtil::ElementsIn(actual_.shape()); - - auto percent_string = [](float a, float b) { - float pct = b == 0.0 ? 0.0 : 100.0 * a / b; - return Printf("%0.4f%%", pct); - }; - - Appendf(&out, - "\nMismatch count %lld (%s) in shape %s (%lld elements), abs bound " - "%g, rel bound %g\n", - num_mismatches_, - percent_string(num_mismatches_, element_count).c_str(), - ShapeUtil::HumanString(actual_.shape()).c_str(), - ShapeUtil::ElementsIn(actual_.shape()), error_.abs, error_.rel); - if (num_nan_mismatches_ > 0) { - StrAppend(&out, "nan mismatches ", num_nan_mismatches_, "\n"); - } - Appendf(&out, "Top relative error mismatches:\n"); - for (auto it = top_rel_mismatches_.rbegin(); - it != top_rel_mismatches_.rend(); ++it) { - StrAppend(&out, " ", it->ToString(actual_.shape()).c_str(), "\n"); - } - - if (!detailed_message_) { - return out; - } - - StrAppend(&out, "Absolute magnitude breakdown of actual values:\n"); - CHECK_EQ(abs_value_buckets_.size() + 1, kAbsValueBucketBounds.size()); - for (int i = 0; i < abs_value_buckets_.size(); ++i) { - const int64 bucket_size = abs_value_buckets_[i].first; - const int64 bucket_mismatches = abs_value_buckets_[i].second; - string mismatch_str = bucket_mismatches > 0 - ? 
Printf(", mismatches %lld", bucket_mismatches) - : ""; - Appendf(&out, " %-6g <= x < %-6g : %7lld (%9s)%s\n", - kAbsValueBucketBounds[i], kAbsValueBucketBounds[i + 1], - bucket_size, percent_string(bucket_size, element_count).c_str(), - mismatch_str.c_str()); - } - - auto print_accum_buckets = [&](const string& header, int64 total, - tensorflow::gtl::ArraySlice buckets) { - StrAppend(&out, header, ":\n"); - Appendf(&out, " < %-6g : %7lld (%s)\n", kErrorBucketBounds[0], - total - buckets[0], - percent_string(total - buckets[0], total).c_str()); - CHECK_EQ(buckets.size(), kErrorBucketBounds.size()); - for (int i = 0; i < kErrorBucketBounds.size(); ++i) { - Appendf(&out, " >= %-6g : %7lld (%s)\n", kErrorBucketBounds[i], - buckets[i], percent_string(buckets[i], total).c_str()); - } - }; - Appendf(&out, "Elements exceeding abs error bound %g: %lld (%s)\n", - error_.abs, num_abs_mismatches_, - percent_string(num_abs_mismatches_, element_count).c_str()); - print_accum_buckets( - "Relative error breakdown of elements exceeding abs error bound", - num_abs_mismatches_, rel_error_buckets_); - Appendf(&out, "Elements exceeding rel error bound %g: %lld (%s)\n", - error_.rel, num_rel_mismatches_, - percent_string(num_rel_mismatches_, element_count).c_str()); - print_accum_buckets( - "Absolute error breakdown of elements exceeding rel error bound", - num_rel_mismatches_, abs_error_buckets_); - return out; - } - - // 'actual' and 'expected' literals being compared. - const Literal& expected_; - const Literal& actual_; - - // The error bounds of the comparison. - ErrorSpec error_; - - // Whether to include detailed breakdown of mismatches in the error message. - bool detailed_message_; - - // Number of element element mismatches encountered so far. - int64 num_mismatches_ = 0; - - // Number of elements with a nan mismatch. - int64 num_nan_mismatches_ = 0; - - // Number of elements which exceed the absolute/relative error bound. - int64 num_abs_mismatches_ = 0; - int64 num_rel_mismatches_ = 0; - - // A Literal containing which elements did not match in the expected and - // actual literals. mismatches_ contains PREDs and is of the same sizes as - // the comparison literals. - Literal mismatches_; - - // The number of mismatches to report in the output, sorted by relative error - // magnitude. - static constexpr int64 kTopRelativeErrorCount = 5; - - // The set of mismatches with the largest relative error. The size of this set - // is bounded by kTopRelativeErrorCount. - std::multiset top_rel_mismatches_; - - // Actual values are bucketed by absolute value. kAbsValueBucketBounds is the - // bounds of these buckets. abs_value_buckets_ contains a pair for each - // bucket: the element count and failure count. - static constexpr std::array kAbsValueBucketBounds = { - 0.0, 0.0001, 0.001, 0.01, 0.1, 1, std::numeric_limits::infinity()}; - std::vector> abs_value_buckets_; - - // Buckets for relative and absolute errors. The relative error buckets only - // contains those elements which exceed the *absolute* error bound, and vice - // versa. This makes it easy to see the effect of adjusting the relative (or - // absolute) error bound on the success of the comparison. kErrorBucketBounds - // are the lower bounds of the buckets in both vectors. The error buckets are - // a cumulative distribution so an error value may appear in more than one - // bucket. For example an error value of 0.003 may appear in the buckets - // bounded by 0.01, 0.1, and 1.0. 
- static constexpr std::array kErrorBucketBounds = {0.0001, 0.001, - 0.01, 0.1, 1}; - std::vector abs_error_buckets_; - std::vector rel_error_buckets_; -}; - -template -constexpr std::array NearComparator::kAbsValueBucketBounds; -template -constexpr std::array NearComparator::kErrorBucketBounds; - -// Helper function for comparing two literals for nearness. Handles tuple-shapes -// via recursion. shape_index is the ShapeIndex of expected (or actual) -// currently being compared. -::testing::AssertionResult NearHelper(const Literal& expected, - const Literal& actual, - const ErrorSpec& error, - bool detailed_message, - const ShapeIndex& shape_index) { - ::testing::AssertionResult err = - LiteralTestUtil::EqualShapes(expected.shape(), actual.shape()); - if (!err) { - return err; - } - - if (ShapeUtil::IsTuple(expected.shape())) { - for (int64 i = 0; i < ShapeUtil::TupleElementCount(expected.shape()); ++i) { - const auto expected_element = LiteralView::Create(expected, {i}); - const auto actual_element = LiteralView::Create(actual, {i}); - ShapeIndex element_index = shape_index; - element_index.push_back(i); - ::testing::AssertionResult res = - NearHelper(expected_element, actual_element, error, detailed_message, - element_index); - if (!res) { - string err_message = - Printf("\nArray at shape index %s%s", - element_index.ToString().c_str(), res.message()); - if (err) { - err = ::testing::AssertionFailure() << err_message; - } else { - err << err_message; - } - } - } - if (!err && shape_index.empty()) { - // Emit a top-level error message containing the top-level shape in case - // of mismatch. - int64 total_elements = RecursiveElementCount(actual.shape()); - err = ::testing::AssertionFailure() - << Printf("\nMismatches in shape %s (%lld elements):\n%s", - ShapeUtil::HumanString(actual.shape()).c_str(), - total_elements, err.message()); - } - return err; - } - - if (ShapeUtil::ElementIsFloating(expected.shape()) || - ShapeUtil::ElementIsComplex(expected.shape())) { - switch (expected.shape().element_type()) { - case BF16: - return NearComparator::Compare(expected, actual, error, - detailed_message); - break; - case F16: - return NearComparator::Compare(expected, actual, error, - detailed_message); - break; - case F32: - return NearComparator::Compare(expected, actual, error, - detailed_message); - break; - case F64: - return NearComparator::Compare(expected, actual, error, - detailed_message); - break; - case C64: - return NearComparator::Compare(expected, actual, error, - detailed_message); - break; - default: - LOG(FATAL) << "Unsupported primitive type in near comparator: " - << PrimitiveType_Name(expected.shape().element_type()) - << ". Must be floating-point type."; - } - } - - // Non-floating point literal. 
- return LiteralTestUtil::Equal(expected, actual); +/* static */ ::testing::AssertionResult LiteralTestUtil::Equal( + const LiteralSlice& expected, const LiteralSlice& actual) { + return StatusToAssertion(literal_comparison::Equal(expected, actual)); } -} // namespace - /* static */ ::testing::AssertionResult LiteralTestUtil::Near( - const Literal& expected, const Literal& actual, const ErrorSpec& error, - bool detailed_message) { - return NearHelper(expected, actual, error, detailed_message, - /*shape_index=*/{}); -} - -/* static */ void LiteralTestUtil::ExpectNear(const Literal& expected, - const Literal& actual, - const ErrorSpec& error, - const string& message) { - ::testing::AssertionResult res = - Near(expected, actual, error, /*detailed_message=*/false); - if (!res) { - res << "Expected: " << TruncateHugeLiteral(expected) << "\n"; - res << "Actual: " << TruncateHugeLiteral(actual) << "\n"; - if (!message.empty()) { - res << StrCat("\nmessage: ", message); - } - } - EXPECT_TRUE(res); + const LiteralSlice& expected, const LiteralSlice& actual, + const ErrorSpec& error_spec, bool detailed_message) { + return StatusToAssertion(literal_comparison::Near( + expected, actual, error_spec, detailed_message, &OnMiscompare)); } -/*static*/ ::testing::AssertionResult LiteralTestUtil::NearOrEqual( - const Literal& expected, const Literal& actual, +/* static */ ::testing::AssertionResult LiteralTestUtil::NearOrEqual( + const LiteralSlice& expected, const LiteralSlice& actual, const tensorflow::gtl::optional& error) { if (error.has_value()) { VLOG(1) << "Expects near"; - return Near(expected, actual, *error); + return StatusToAssertion(literal_comparison::Near( + expected, actual, *error, /*detailed_message=*/false, &OnMiscompare)); } VLOG(1) << "Expects equal"; - return Equal(expected, actual); -} - -/*static*/ void LiteralTestUtil::ExpectNearOrEqual( - const Literal& expected, const Literal& actual, - const tensorflow::gtl::optional& error) { - EXPECT_TRUE(NearOrEqual(expected, actual, error)); -} - -/* static */ string LiteralTestUtil::MultiIndexAsString( - tensorflow::gtl::ArraySlice multi_index) { - return StrCat("{", tensorflow::str_util::Join(multi_index, ","), "}"); -} - -/* static */ std::unique_ptr LiteralTestUtil::Reshape( - tensorflow::gtl::ArraySlice new_dimensions, - tensorflow::gtl::ArraySlice minor_to_major, const Literal& literal) { - int64 new_num_elements = 1; - for (int64 i = 0; i < new_dimensions.size(); ++i) { - new_num_elements *= new_dimensions[i]; - } - CHECK_EQ(ShapeUtil::ElementsIn(literal.shape()), new_num_elements); - CHECK_EQ(new_dimensions.size(), minor_to_major.size()); - - auto new_literal = MakeUnique( - ShapeUtil::MakeShape(literal.shape().element_type(), new_dimensions)); - - // Create a new shape with the given minor-to-major layout. This shape is used - // solely for converting linear address to multi-dimensional addresses when - // writing elements to the new literal. - Shape shape_with_layout = new_literal->shape(); - *shape_with_layout.mutable_layout() = LayoutUtil::MakeLayout(minor_to_major); - - // Copy data into new literal, element-by-element. 
- for (int64 i = 0; i < ShapeUtil::ElementsIn(literal.shape()); ++i) { - std::vector from_multi_index = - IndexUtil::LinearIndexToMultidimensionalIndex(literal.shape(), i); - std::vector to_multi_index = - IndexUtil::LinearIndexToMultidimensionalIndex(shape_with_layout, i); - switch (literal.shape().element_type()) { - case PRED: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case U8: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case U32: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case S32: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case U64: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case S64: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case F32: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case F64: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - case C64: - new_literal->Set(to_multi_index, - literal.Get(from_multi_index)); - break; - default: - LOG(FATAL) << "Unhandled primitive element type: " - << PrimitiveType_Name(literal.shape().element_type()); - } - } - - return new_literal; + return StatusToAssertion(literal_comparison::Equal(expected, actual)); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/literal_test_util.h b/tensorflow/compiler/xla/tests/literal_test_util.h index a755568c0f098e..d1b8a6cf0b2552 100644 --- a/tensorflow/compiler/xla/tests/literal_test_util.h +++ b/tensorflow/compiler/xla/tests/literal_test_util.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/array4d.h" +#include "tensorflow/compiler/xla/error_spec.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -38,282 +39,190 @@ limitations under the License. namespace xla { -// Structure describing permissible absolute and relative error bounds. -struct ErrorSpec { - explicit ErrorSpec(float aabs, float arel = 0, bool relaxed_nans = false) - : abs(aabs), rel(arel), relaxed_nans(relaxed_nans) {} - - float abs; // Absolute error bound. - float rel; // Relative error bound. - - // If relaxed_nans is true then any result is valid if we are expecting NaNs. - // In effect, this allows the tested operation to produce incorrect results - // for inputs outside its mathematical domain. - bool relaxed_nans; -}; - // Utility class for making expectations/assertions related to XLA literals. class LiteralTestUtil { public: // Asserts that the given shapes have the same rank, dimension sizes, and // primitive types. - static ::testing::AssertionResult EqualShapes(const Shape& expected, - const Shape& actual); - static void AssertEqualShapes(const Shape& expected, const Shape& actual); + static ::testing::AssertionResult EqualShapes( + const Shape& expected, const Shape& actual) TF_MUST_USE_RESULT; // Asserts that the provided shapes are equal as defined in AssertEqualShapes // and that they have the same layout. - static void AssertEqualShapesAndLayouts(const Shape& expected, - const Shape& actual); - - // If the given literal's data type is bfloat16, converts it to a float - // literal; otherwise, returns a copy of it. If the literal is a tuple, - // recursively converts its elements. 
- static std::unique_ptr ConvertBF16ToF32(const Literal& bf16_literal); - - // If the given literal's data type is float, converts it to a bfloat16 - // literal; otherwise, returns a copy of it. If the literal is a tuple, - // recursively converts its elements. - static std::unique_ptr ConvertF32ToBF16(const Literal& f32_literal); - - // Asserts that the expected and actual literals are (bitwise) equal for all - // elements in the literal. Also, asserts that the rank, dimensions sizes, and - // primitive type are equal. - static ::testing::AssertionResult Equal( - const Literal& expected, const Literal& actual) TF_MUST_USE_RESULT; + static ::testing::AssertionResult EqualShapesAndLayouts( + const Shape& expected, const Shape& actual) TF_MUST_USE_RESULT; - // Expects that expected and actual are Equal. - static void ExpectEqual(const Literal& expected, const Literal& actual, - const string& message = ""); - - // Expects that expected and actual are Not Equal. - static void ExpectNotEqual(const Literal& expected, const Literal& actual); + static ::testing::AssertionResult Equal(const LiteralSlice& expected, + const LiteralSlice& actual) + TF_MUST_USE_RESULT; // Asserts the given literal are (bitwise) equal to given expected values. template - static void ExpectR0Equal(NativeT expected, const Literal& actual); + static void ExpectR0Equal(NativeT expected, const LiteralSlice& actual); + template static void ExpectR1Equal(tensorflow::gtl::ArraySlice expected, - const Literal& actual); + const LiteralSlice& actual); template static void ExpectR2Equal( std::initializer_list> expected, - const Literal& actual); + const LiteralSlice& actual); + template static void ExpectR3Equal( std::initializer_list< std::initializer_list>> expected, - const Literal& actual); + const LiteralSlice& actual); // Asserts the given literal are (bitwise) equal to given array. template static void ExpectR2EqualArray2D(const Array2D& expected, - const Literal& actual); + const LiteralSlice& actual); template static void ExpectR3EqualArray3D(const Array3D& expected, - const Literal& actual); + const LiteralSlice& actual); template static void ExpectR4EqualArray4D(const Array4D& expected, - const Literal& actual); + const LiteralSlice& actual); - // Asserts that the expected and actual literals are within the given error - // bound for all elements. Also, asserts that the rank, dimensions sizes, and - // bounds are equivalent. + // Decorates literal_comparison::Near() with an AssertionResult return type. // - // Tuples are matched recursively. When comparing tensors of - // non-floating-point type, checks for exact equality, ignoring the ErrorSpec. - // - // If the shape of the literals is neither a complex/floating-point tensor nor - // a tuple which contains a complex/floating-point tensor, Near() is - // equivalent to Equal(). We don't raise an error in this case, because we - // want to allow callers to call Near() even if they have no preconceptions - // about the shapes being compared. - // - // If detailed_message is true, then the error message in the assertion result - // will contain a more detailed breakdown of mismatches. + // See comment on literal_comparison::Near(). static ::testing::AssertionResult Near( - const Literal& expected, const Literal& actual, const ErrorSpec& error, + const LiteralSlice& expected, const LiteralSlice& actual, + const ErrorSpec& error_spec, bool detailed_message = false) TF_MUST_USE_RESULT; - // Expects expected and actual to be Near with the given error. 
- static void ExpectNear(const Literal& expected, const Literal& actual, - const ErrorSpec& error, const string& message = ""); - // Asserts the given literal are within the given error bound of the given // expected values. Only supported for floating point values. template - static void ExpectR0Near(NativeT expected, const Literal& actual, + static void ExpectR0Near(NativeT expected, const LiteralSlice& actual, const ErrorSpec& error); + template static void ExpectR1Near(tensorflow::gtl::ArraySlice expected, - const Literal& actual, const ErrorSpec& error); + const LiteralSlice& actual, const ErrorSpec& error); + template static void ExpectR2Near( std::initializer_list> expected, - const Literal& actual, const ErrorSpec& error); + const LiteralSlice& actual, const ErrorSpec& error); + template static void ExpectR3Near( std::initializer_list< std::initializer_list>> expected, - const Literal& actual, const ErrorSpec& error); + const LiteralSlice& actual, const ErrorSpec& error); + template static void ExpectR4Near( std::initializer_list>>> expected, - const Literal& actual, const ErrorSpec& error); + const LiteralSlice& actual, const ErrorSpec& error); // Asserts the given literal are within the given error bound to the given // array. Only supported for floating point values. template static void ExpectR2NearArray2D(const Array2D& expected, - const Literal& actual, + const LiteralSlice& actual, const ErrorSpec& error); + template static void ExpectR3NearArray3D(const Array3D& expected, - const Literal& actual, + const LiteralSlice& actual, const ErrorSpec& error); + template static void ExpectR4NearArray4D(const Array4D& expected, - const Literal& actual, + const LiteralSlice& actual, const ErrorSpec& error); // If the error spec is given, returns whether the expected and the actual are // within the error bound; otherwise, returns whether they are equal. Tuples // will be compared recursively. static ::testing::AssertionResult NearOrEqual( - const Literal& expected, const Literal& actual, + const LiteralSlice& expected, const LiteralSlice& actual, const tensorflow::gtl::optional& error) TF_MUST_USE_RESULT; - // If the error spec is given, expects the expected and the actual to be near; - // otherwise, expects them to be equal. Tuples will be compared recursively. - static void ExpectNearOrEqual( - const Literal& expected, const Literal& actual, - const tensorflow::gtl::optional& error); - - // Returns a multi-dimensional index as a string. For example: '{7, 8}' will - // be returned for a 2-dimensional index with dimension 0 index equal to 7, - // dimension 1 equal to 8. - static string MultiIndexAsString( - tensorflow::gtl::ArraySlice multi_index); - - // Creates a literal with a new shape with the given new dimensions using the - // data in the given input literal. For reshaping purposes the (flat) data - // buffer of the input literal is assumed to have the given minor_to_major - // layout order. - static std::unique_ptr Reshape( - tensorflow::gtl::ArraySlice new_dimensions, - tensorflow::gtl::ArraySlice minor_to_major, - const Literal& literal); - - // Creates a literal with the supplied shape, and uses the provided value - // generator to populate the literal's values. - // Returns the new literal object, or an error Status if failed. 
- template < - PrimitiveType type, - typename T = typename primitive_util::PrimitiveTypeToNative::type> - static StatusOr> CreateRandomLiteral( - const Shape& shape, - const std::function)>& generator); - - // Creates a literal with the supplied shape, and initializes the literal - // values using a normal distribution with given mean and stddev standard - // deviation, and using the engine as entropy generator. - // Returns the new literal object, or an error Status if failed. - template < - PrimitiveType type, typename E, - typename T = typename primitive_util::PrimitiveTypeToNative::type> - static StatusOr> CreateRandomLiteral( - const Shape& shape, E* engine, T mean, T stddev); - - // Creates a literal with the supplied shape, and initializes the literal - // values using a normal distribution with given mean and stddev standard - // deviation. - // Returns the new literal object, or an error Status if failed. - template < - PrimitiveType type, - typename T = typename primitive_util::PrimitiveTypeToNative::type> - static StatusOr> CreateRandomLiteral( - const Shape& shape, T mean, T stddev); - private: TF_DISALLOW_COPY_AND_ASSIGN(LiteralTestUtil); }; template /* static */ void LiteralTestUtil::ExpectR0Equal(NativeT expected, - const Literal& actual) { - ExpectEqual(*Literal::CreateR0(expected), actual); + const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*Literal::CreateR0(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR1Equal( - tensorflow::gtl::ArraySlice expected, const Literal& actual) { - ExpectEqual(*Literal::CreateR1(expected), actual); + tensorflow::gtl::ArraySlice expected, const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*Literal::CreateR1(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR2Equal( std::initializer_list> expected, - const Literal& actual) { - ExpectEqual(*Literal::CreateR2(expected), actual); + const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*Literal::CreateR2(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR3Equal( std::initializer_list>> expected, - const Literal& actual) { - ExpectEqual(*Literal::CreateR3(expected), actual); + const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*Literal::CreateR3(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR2EqualArray2D( - const Array2D& expected, const Literal& actual) { - ExpectEqual(*Literal::CreateR2FromArray2D(expected), actual); + const Array2D& expected, const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*Literal::CreateR2FromArray2D(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR3EqualArray3D( - const Array3D& expected, const Literal& actual) { - ExpectEqual(*Literal::CreateR3FromArray3D(expected), actual); + const Array3D& expected, const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*Literal::CreateR3FromArray3D(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR4EqualArray4D( - const Array4D& expected, const Literal& actual) { - ExpectEqual(*Literal::CreateR4FromArray4D(expected), actual); + const Array4D& expected, const LiteralSlice& actual) { + EXPECT_TRUE(Equal(*Literal::CreateR4FromArray4D(expected), actual)); } template /* static */ void LiteralTestUtil::ExpectR0Near(NativeT expected, - const Literal& actual, + const LiteralSlice& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR0(expected), actual, error); + EXPECT_TRUE(Near(*Literal::CreateR0(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR1Near( - 
tensorflow::gtl::ArraySlice expected, const Literal& actual, + tensorflow::gtl::ArraySlice expected, const LiteralSlice& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR1(expected), actual, error); + EXPECT_TRUE(Near(*Literal::CreateR1(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR2Near( std::initializer_list> expected, - const Literal& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR2(expected), actual, error); + const LiteralSlice& actual, const ErrorSpec& error) { + EXPECT_TRUE(Near(*Literal::CreateR2(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR3Near( std::initializer_list>> expected, - const Literal& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR3(expected), actual, error); + const LiteralSlice& actual, const ErrorSpec& error) { + EXPECT_TRUE(Near(*Literal::CreateR3(expected), actual, error)); } template @@ -321,63 +230,29 @@ template std::initializer_list>>> expected, - const Literal& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR4(expected), actual, error); + const LiteralSlice& actual, const ErrorSpec& error) { + EXPECT_TRUE(Near(*Literal::CreateR4(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR2NearArray2D( - const Array2D& expected, const Literal& actual, + const Array2D& expected, const LiteralSlice& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR2FromArray2D(expected), actual, error); + EXPECT_TRUE(Near(*Literal::CreateR2FromArray2D(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR3NearArray3D( - const Array3D& expected, const Literal& actual, + const Array3D& expected, const LiteralSlice& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR3FromArray3D(expected), actual, error); + EXPECT_TRUE(Near(*Literal::CreateR3FromArray3D(expected), actual, error)); } template /* static */ void LiteralTestUtil::ExpectR4NearArray4D( - const Array4D& expected, const Literal& actual, + const Array4D& expected, const LiteralSlice& actual, const ErrorSpec& error) { - ExpectNear(*Literal::CreateR4FromArray4D(expected), actual, error); -} - -template -/* static */ StatusOr> -LiteralTestUtil::CreateRandomLiteral( - const Shape& shape, - const std::function)>& generator) { - using NativeT = typename primitive_util::PrimitiveTypeToNative::type; - TF_RET_CHECK(shape.element_type() == type); - std::unique_ptr literal = Literal::CreateFromShape(shape); - TF_RETURN_IF_ERROR(literal.get()->Populate( - [&](tensorflow::gtl::ArraySlice indexes) { - return generator(indexes); - })); - return std::move(literal); -} - -template -/* static */ StatusOr> -LiteralTestUtil::CreateRandomLiteral(const Shape& shape, E* engine, T mean, - T stddev) { - using NativeT = typename primitive_util::PrimitiveTypeToNative::type; - std::normal_distribution generator(mean, stddev); - return CreateRandomLiteral( - shape, [&](tensorflow::gtl::ArraySlice /*indexes*/) { - return generator(*engine); - }); -} - -template -/* static */ StatusOr> -LiteralTestUtil::CreateRandomLiteral(const Shape& shape, T mean, T stddev) { - std::minstd_rand0 engine; - return CreateRandomLiteral(shape, &engine, mean, stddev); + EXPECT_TRUE(Near(*Literal::CreateR4FromArray4D(expected), actual, error)); } } // namespace xla diff --git a/tensorflow/compiler/xla/tests/literal_test_util_test.cc b/tensorflow/compiler/xla/tests/literal_test_util_test.cc index 9d619a77c7e8d6..bbac7285aefbb1 100644 --- 
a/tensorflow/compiler/xla/tests/literal_test_util_test.cc +++ b/tensorflow/compiler/xla/tests/literal_test_util_test.cc @@ -34,7 +34,7 @@ TEST(LiteralTestUtilTest, ComparesEqualTuplesEqual) { std::unique_ptr literal = Literal::MakeTuple({ Literal::CreateR0(42).get(), Literal::CreateR0(64).get(), }); - LiteralTestUtil::ExpectEqual(*literal, *literal); + EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *literal)); } TEST(LiteralTestUtilTest, ComparesUnequalTuplesUnequal) { @@ -97,6 +97,15 @@ TEST(LiteralTestUtilTest, ExpectNearFailurePlacesResultsInTemporaryDirectory) { } } +TEST(LiteralTestUtilTest, NotEqualHasValuesInMessage) { + auto expected = Literal::CreateR1({1, 2, 3}); + auto actual = Literal::CreateR1({4, 5, 6}); + ::testing::AssertionResult result = + LiteralTestUtil::Equal(*expected, *actual); + EXPECT_THAT(result.message(), ::testing::HasSubstr("expected: {1, 2, 3}")); + EXPECT_THAT(result.message(), ::testing::HasSubstr("actual: {4, 5, 6}")); +} + TEST(LiteralTestUtilTest, NearComparatorR1) { auto a = Literal::CreateR1({0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8}); diff --git a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc index 3023df47cda33f..2c45f19c090d26 100644 --- a/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc +++ b/tensorflow/compiler/xla/tests/llvm_irgen_test_base.cc @@ -62,8 +62,8 @@ void LLVMIRGenTestBase::CompileAheadOfTimeAndVerifyIr( std::unique_ptr hlo_module, const AotCompilationOptions& options, const string& pattern, bool match_optimized_ir) { SetIrHook(match_optimized_ir); - ASSERT_TRUE( - CompileToAotCompilationResult(std::move(hlo_module), options).ok()); + TF_ASSERT_OK( + CompileToAotCompilationResult(std::move(hlo_module), options).status()); ResetIrHook(); StatusOr filecheck_result = RunFileCheck(ir_, pattern); diff --git a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc index 3704ddd8010bf7..a366afe8262e1f 100644 --- a/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc +++ b/tensorflow/compiler/xla/tests/local_client_aot_test_helper.cc @@ -21,7 +21,8 @@ limitations under the License. #include "llvm/ADT/Triple.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h" #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h" #include "tensorflow/compiler/xla/types.h" @@ -29,27 +30,31 @@ limitations under the License. 
#include "tensorflow/core/platform/init_main.h" #include "tensorflow/core/platform/logging.h" +namespace { + using xla::string; -xla::Computation Doubler(xla::Client* client) { - xla::ComputationBuilder builder(client, "doubler"); +xla::XlaComputation Doubler() { + xla::XlaBuilder builder("doubler"); auto r0f32 = xla::ShapeUtil::MakeShape(xla::F32, {}); auto x = builder.Parameter(0, r0f32, "x"); builder.Mul(x, builder.ConstantR0(2.0)); return std::move(builder.Build().ValueOrDie()); } +} // namespace + int main(int argc, char** argv) { tensorflow::port::InitMain(argv[0], &argc, &argv); auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient().ValueOrDie(); - xla::ComputationBuilder builder(client, "aot_test_helper"); + xla::XlaBuilder builder("aot_test_helper"); auto opaque_shape = xla::ShapeUtil::MakeOpaqueShape(); auto opaque_param = builder.Parameter(0, opaque_shape, "x"); auto r0f32 = xla::ShapeUtil::MakeShape(xla::F32, {}); auto sum = builder.CustomCall("SumStructElements", {opaque_param}, r0f32); - builder.Call(Doubler(client), {sum}); + builder.Call(Doubler(), {sum}); if (argc != 2) { LOG(FATAL) << "local_client_aot_test_helper TARGET_CPU"; @@ -71,8 +76,8 @@ int main(int argc, char** argv) { llvm::Triple triple(xla::llvm_ir::AsStringRef(triple_string)); - xla::Computation computation = builder.Build().ConsumeValueOrDie(); - xla::CompileOnlyClient::AotComputationInstance instance{ + xla::XlaComputation computation = builder.Build().ConsumeValueOrDie(); + xla::CompileOnlyClient::AotXlaComputationInstance instance{ &computation, /*argument_layouts=*/{&opaque_shape}, &r0f32}; xla::cpu::CpuAotCompilationOptions options( diff --git a/tensorflow/compiler/xla/tests/local_client_execute_test.cc b/tensorflow/compiler/xla/tests/local_client_execute_test.cc index 44c6811df84f49..96858c00d6bbe5 100644 --- a/tensorflow/compiler/xla/tests/local_client_execute_test.cc +++ b/tensorflow/compiler/xla/tests/local_client_execute_test.cc @@ -210,12 +210,12 @@ XLA_TEST_F(LocalClientExecuteTest, TupleResult) { std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0})); + {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {0})); LiteralTestUtil::ExpectR2Equal( {{10.0f, 20.0f}, {30.0f, 40.0f}}, - LiteralView::Create(*result_literal, {1})); + LiteralSlice(*result_literal, {1})); LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {2})); + {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {2})); } XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { @@ -239,16 +239,16 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleResult) { std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1})); + {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {1})); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f}, {3.0f, 4.0f}}, - LiteralView::Create(*result_literal, {0, 0})); + LiteralSlice(*result_literal, {0, 0})); LiteralTestUtil::ExpectR2Equal( {{10.0f, 20.0f}, {30.0f, 40.0f}}, - LiteralView::Create(*result_literal, {0, 1})); + LiteralSlice(*result_literal, {0, 1})); LiteralTestUtil::ExpectR2Equal( {{1.0f, 2.0f}, {3.0f, 4.0f}}, - LiteralView::Create(*result_literal, {0, 2})); + LiteralSlice(*result_literal, {0, 2})); } XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) { @@ -274,9 +274,9 @@ 
XLA_TEST_F(LocalClientExecuteTest, TupleResultWithLayout) { std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {0})); + {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {0})); LiteralTestUtil::ExpectR2Equal( - {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralView::Create(*result_literal, {1})); + {{1.0f, 2.0f}, {3.0f, 4.0f}}, LiteralSlice(*result_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { @@ -321,9 +321,9 @@ XLA_TEST_F(LocalClientExecuteTest, TupleArguments) { std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( {{56.0f, 46.0f}, {36.0f, 26.0f}}, - LiteralView::Create(*result_literal, {0})); + LiteralSlice(*result_literal, {0})); LiteralTestUtil::ExpectR1Equal( - {40.0f, 71.0f, 117.0f}, LiteralView::Create(*result_literal, {1})); + {40.0f, 71.0f, 117.0f}, LiteralSlice(*result_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) { @@ -361,9 +361,9 @@ XLA_TEST_F(LocalClientExecuteTest, NestedTupleArgument) { std::unique_ptr result_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR2Equal( - {{-1.0, -2.0}, {-3.0, -4}}, LiteralView::Create(*result_literal, {0})); + {{-1.0, -2.0}, {-3.0, -4}}, LiteralSlice(*result_literal, {0})); LiteralTestUtil::ExpectR1Equal( - {264.0, 73.0, 133.0}, LiteralView::Create(*result_literal, {1})); + {264.0, 73.0, 133.0}, LiteralSlice(*result_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) { @@ -391,16 +391,16 @@ XLA_TEST_F(LocalClientExecuteTest, PassingTupleResultBackIntoComputation) { std::unique_ptr result_0_literal = ShapedBufferToLiteral(result_0); LiteralTestUtil::ExpectR2Equal( {{-1.0, -2.0}, {-3.0, -4.0}}, - LiteralView::Create(*result_0_literal, {0})); + LiteralSlice(*result_0_literal, {0})); LiteralTestUtil::ExpectR2Equal( - {{22.0, 6.0}, {8.0, 10}}, LiteralView::Create(*result_0_literal, {1})); + {{22.0, 6.0}, {8.0, 10}}, LiteralSlice(*result_0_literal, {1})); ScopedShapedBuffer result_1 = ExecuteLocallyOrDie(computation, {&result_0}); std::unique_ptr result_1_literal = ShapedBufferToLiteral(result_1); LiteralTestUtil::ExpectR2Equal( - {{1.0, 2.0}, {3.0, 4.0}}, LiteralView::Create(*result_1_literal, {0})); + {{1.0, 2.0}, {3.0, 4.0}}, LiteralSlice(*result_1_literal, {0})); LiteralTestUtil::ExpectR2Equal( - {{44.0, 12.0}, {16.0, 20}}, LiteralView::Create(*result_1_literal, {1})); + {{44.0, 12.0}, {16.0, 20}}, LiteralSlice(*result_1_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { @@ -447,7 +447,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeTuple) { for (int i = 0; i < kElementCount; ++i) { LiteralTestUtil::ExpectR1Near( - {2.0f * i, 0.0f}, LiteralView::Create(*result_literal, {i}), + {2.0f * i, 0.0f}, LiteralSlice(*result_literal, {i}), error_spec_); } } @@ -502,7 +502,7 @@ XLA_TEST_F(LocalClientExecuteTest, LargeNestedTuple) { for (int i = 0; i < kFanout; ++i) { for (int j = 0; j < kFanout; ++j) { LiteralTestUtil::ExpectR0Near( - i + j + i * kFanout + j, LiteralView::Create(*result_literal, {i, j}), + i + j + i * kFanout + j, LiteralSlice(*result_literal, {i, j}), error_spec_); } } @@ -548,7 +548,7 @@ XLA_TEST_F(LocalClientExecuteTest, DeepTuple) { index.push_back(0); } LiteralTestUtil::ExpectR0Equal( - 165.0, LiteralView::Create(*result_literal, index)); + 165.0, LiteralSlice(*result_literal, index)); } XLA_TEST_F(LocalClientExecuteTest, InvalidNumberOfArguments) { @@ -754,9 
+754,9 @@ XLA_TEST_F(LocalClientExecuteTest, SelectBetweenTuples) { ExecuteLocallyOrDie(builder.Build().ValueOrDie(), {}); std::unique_ptr tuple_literal = ShapedBufferToLiteral(result); LiteralTestUtil::ExpectR1Equal( - {2.0f, 4.0f, 6.0f}, LiteralView::Create(*tuple_literal, {0})); + {2.0f, 4.0f, 6.0f}, LiteralSlice(*tuple_literal, {0})); LiteralTestUtil::ExpectR1Equal( - {1.0f, 2.0f, 3.0f}, LiteralView::Create(*tuple_literal, {1})); + {1.0f, 2.0f, 3.0f}, LiteralSlice(*tuple_literal, {1})); } XLA_TEST_F(LocalClientExecuteTest, CompileExecutable) { diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.cc b/tensorflow/compiler/xla/tests/local_client_test_base.cc index ca8e4cdbdb6a8f..88797a7d0a7d05 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.cc +++ b/tensorflow/compiler/xla/tests/local_client_test_base.cc @@ -35,9 +35,9 @@ namespace xla { /* static */ TestAllocator* LocalClientTestBase::allocator_; -StatusOr TestAllocator::Allocate(int device_ordinal, - uint64 size, - bool retry_on_failure) { +StatusOr TestAllocator::Allocate(int device_ordinal, + uint64 size, + bool retry_on_failure) { VLOG(2) << "Allocate(" << device_ordinal << ", " << size << ")"; { tensorflow::mutex_lock lock(count_mutex_); @@ -48,8 +48,7 @@ StatusOr TestAllocator::Allocate(int device_ordinal, retry_on_failure); } -tensorflow::Status TestAllocator::Deallocate(int device_ordinal, - se::DeviceMemoryBase* mem) { +Status TestAllocator::Deallocate(int device_ordinal, se::DeviceMemoryBase mem) { VLOG(2) << "Deallocate(" << device_ordinal << ")"; { tensorflow::mutex_lock lock(count_mutex_); @@ -149,8 +148,6 @@ ExecutableBuildOptions LocalClientTestBase::DefaultExecutableBuildOptions() ExecutableRunOptions LocalClientTestBase::DefaultExecutableRunOptions() const { ExecutableRunOptions run_options; - run_options.set_inter_op_thread_pool( - local_client_->backend().inter_op_thread_pool()); run_options.set_intra_op_thread_pool(thread_pool_wrapper_->device.get()); run_options.set_allocator(GetOrCreateAllocator(local_client_->platform())); return run_options; diff --git a/tensorflow/compiler/xla/tests/local_client_test_base.h b/tensorflow/compiler/xla/tests/local_client_test_base.h index 3bbb760c806412..258226523d830b 100644 --- a/tensorflow/compiler/xla/tests/local_client_test_base.h +++ b/tensorflow/compiler/xla/tests/local_client_test_base.h @@ -46,10 +46,9 @@ class TestAllocator : public StreamExecutorMemoryAllocator { platform, PlatformUtil::GetStreamExecutors(platform).ValueOrDie()) { } - StatusOr Allocate(int device_ordinal, uint64 size, - bool retry_on_failure) override; - tensorflow::Status Deallocate(int device_ordinal, - se::DeviceMemoryBase* mem) override; + StatusOr Allocate(int device_ordinal, uint64 size, + bool retry_on_failure) override; + Status Deallocate(int device_ordinal, se::DeviceMemoryBase mem) override; // Return the number of allocations that have been performed. int64 allocation_count() const; diff --git a/tensorflow/compiler/xla/tests/log_test.cc b/tensorflow/compiler/xla/tests/log_test.cc index 174d433a9e1731..c0c02e584c2348 100644 --- a/tensorflow/compiler/xla/tests/log_test.cc +++ b/tensorflow/compiler/xla/tests/log_test.cc @@ -16,8 +16,8 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -29,7 +29,7 @@ namespace { class LogTest : public ClientLibraryTestBase {}; XLA_TEST_F(LogTest, LogZeroValues) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR3FromArray3D(Array3D(3, 0, 0)); builder.Log(x); @@ -41,7 +41,7 @@ TEST_F(LogTest, LogTenValues) { std::vector input = {-0.0, 1.0, 2.0, -3.0, -4.0, 5.0, 6.0, -7.0, -8.0, 9.0}; - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.ConstantR1(input); builder.Log(x); diff --git a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc index 7fa61eb33c2930..27fd36e06acdc5 100644 --- a/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/matrix_ops_simple_test.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" @@ -52,12 +51,7 @@ class MatOpsSimpleTest : public ClientLibraryTestBase {}; template class MatOpsSimpleTest_F16F32 : public MatOpsSimpleTest {}; -// TODO(bixia): This test for F16 failed on GPU 02-25-2018. 
-#ifdef XLA_TEST_BACKEND_GPU -TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, ::testing::Types); -#else TYPED_TEST_CASE(MatOpsSimpleTest_F16F32, TypesF16F32); -#endif XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) { using T = TypeParam; @@ -72,8 +66,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, ExpTwoByTwoValues) { Literal::CreateR2FromArray2D({{2.71828f, 1.00000f}, // row 0 {0.36788f, 1.64872f}}); // row 1 - this->template ComputeAndCompareLiteral(&builder, *expected, {}, - ErrorSpec(1e-5)); + this->ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5)); } XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { @@ -101,8 +94,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MapTwoByTwo) { std::unique_ptr expected = Literal::CreateR2FromArray2D({{1.5f, 0.5f}, // row 0 {-0.5f, 1.0f}}); // row 1 - this->template ComputeAndCompareLiteral(&builder, *expected, {}, - ErrorSpec(1e-5)); + this->ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-5)); } XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MaxTwoByTwoValues) { @@ -121,8 +113,7 @@ XLA_TYPED_TEST(MatOpsSimpleTest_F16F32, MaxTwoByTwoValues) { std::unique_ptr expected = Literal::CreateR2FromArray2D({{7.0f, 6.0f}, // row 0 {3.0f, -4.0f}}); // row 1 - this->template ComputeAndCompareLiteral(&builder, *expected, {}, - ErrorSpec(1e-6)); + this->ComputeAndCompareLiteral(&builder, *expected, {}, ErrorSpec(1e-6)); } struct TestLinspaceMaxParam { @@ -171,11 +162,8 @@ string PrintTestLinspaceMaxParam( } #ifndef XLA_BACKEND_DOES_NOT_SUPPORT_FLOAT16 -// TODO(bixia): This test failed on GPU 02-25-2018 -#ifdef XLA_TEST_BACKEND_CPU XLA_TEST_P(TestLinspaceMaxParametric, TestF16) { TestImpl(); } #endif -#endif XLA_TEST_P(TestLinspaceMaxParametric, TestF32) { TestImpl(); } INSTANTIATE_TEST_CASE_P( diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc index 0a603f4954badd..7bfc8eb546d10d 100644 --- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc +++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc @@ -19,7 +19,6 @@ limitations under the License. 
#include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/primitive_util.h" @@ -108,7 +107,7 @@ class MultiOutputFusionTest : public HloTestBase { expect.PopulateWithValue(size * 1.5f * 3.5f); auto actual = ExecuteAndTransfer( std::move(hlo_module), {Literal::CreateR0(-9.0f).get(), &arg1}); - LiteralTestUtil::ExpectNear(expect, *actual, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near(expect, *actual, error_spec_)); } void RunTest1D(bool manual_fusion, int size) { @@ -168,7 +167,7 @@ class MultiOutputFusionTest : public HloTestBase { Literal expect = std::move(*Literal::CreateR1({size * 1.5f * 3.5f})); auto actual = ExecuteAndTransfer(std::move(hlo_module), {&input0, &input1}); - LiteralTestUtil::ExpectNear(expect, *actual, error_spec_); + EXPECT_TRUE(LiteralTestUtil::Near(expect, *actual, error_spec_)); } }; @@ -211,5 +210,175 @@ XLA_TEST_F(MultiOutputFusionTest, FusionNodeIsRoot) { *result, *Literal::MakeTupleOwned(Literal::CreateR0(42)))); } +XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFusion) { + const char* testcase = R"( + HloModule m + + fused_computation { + p = f32[4] parameter(0) + multiply = f32[4] multiply(p, p) + less-than = pred[4] less-than(p, multiply) + ROOT tuple = (pred[4], f32[4]) tuple(less-than, multiply) + } + + ENTRY PredFloatMOF { + p0 = f32[4] parameter(0) + fusion = (pred[4], f32[4]) fusion(p0), kind=kLoop, calls=fused_computation + gte0 = pred[4] get-tuple-element(fusion), index=0 + gte1 = f32[4] get-tuple-element(fusion), index=1 + const = f32[4] constant({0, 0, 0, 0}) + ROOT select = f32[4] select(gte0, gte1, const) + })"; + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = Literal::CreateR1({1.0, 2.0, 3.0, -1.0}); + TF_ASSERT_OK_AND_ASSIGN(auto result, + Execute(std::move(module), {param.get()})); + EXPECT_TRUE(LiteralTestUtil::Equal( + *result, *Literal::CreateR1({0.0, 4.0, 9.0, 1.0}))); +} + +XLA_TEST_F(MultiOutputFusionTest, MultiOutputLoopFeedingMap) { + const char* testcase = R"( + HloModule m + + fused_computation { + p = f32[] parameter(0) + multiply = f32[] multiply(p, p) + less-than = pred[] less-than(p, multiply) + ROOT tuple = (pred[], f32[]) tuple(less-than, multiply) + } + + map_computation { + p0 = f32[] parameter(0) + fusion = (pred[], f32[]) fusion(p0), kind=kLoop, calls=fused_computation + gte0 = pred[] get-tuple-element(fusion), index=0 + gte1 = f32[] get-tuple-element(fusion), index=1 + const = f32[] constant(0) + ROOT select = f32[] select(gte0, gte1, const) + } + + ENTRY MapMOF { + p1 = f32[3] parameter(0) + ROOT map = f32[3] map(p1), to_apply=map_computation + })"; + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = Literal::CreateR1({1.0, 2.0, 3.0}); + TF_ASSERT_OK_AND_ASSIGN(auto result, + Execute(std::move(module), {param.get()})); + EXPECT_TRUE(LiteralTestUtil::Equal( + *result, *Literal::CreateR1({0.0, 4.0, 9.0}))); +} + +const char* const kScalarOps = R"( + HloModule m + + Add { + lhsadd = f32[] parameter(0) + rhsadd = f32[] parameter(1) + ROOT add = f32[] add(lhsadd, rhsadd) + } + + Max { + lhsmax = f32[] parameter(0) + rhsmax = f32[] parameter(1) + ROOT max = f32[] maximum(lhsmax, rhsmax) + } +)"; + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionMinor)) { + const string testcase = 
tensorflow::strings::StrCat(kScalarOps, R"( + fused_reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(p0, c0), dimensions={2}, to_apply=Add + mul = f32[2,2,2]{2,1,0} multiply(p0, p0) + c1 = f32[] constant(5) + r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=Max + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2) + } + + ENTRY reduce { + p = f32[2,2,2]{2,1,0} parameter(0) + ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput, + calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + TF_ASSERT_OK_AND_ASSIGN(auto result, + Execute(std::move(module), {param.get()})); + EXPECT_TRUE(LiteralTestUtil::Equal( + *result, + *Literal::MakeTupleOwned(Literal::CreateR2({{3, 7}, {11, 15}}), + Literal::CreateR2({{5, 16}, {36, 64}})))); +} + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionMajor)) { + const string testcase = tensorflow::strings::StrCat(kScalarOps, R"( + fused_reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + r1 = f32[2,2]{1,0} reduce(p0, c0), dimensions={0}, to_apply=Add + mul = f32[2,2,2]{2,1,0} multiply(p0, p0) + c1 = f32[] constant(5) + r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={0}, to_apply=Max + ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2) + } + + ENTRY reduce { + p = f32[2,2,2]{2,1,0} parameter(0) + ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p), kind=kInput, + calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + TF_ASSERT_OK_AND_ASSIGN(auto result, + Execute(std::move(module), {param.get()})); + EXPECT_TRUE(LiteralTestUtil::Equal( + *result, *Literal::MakeTupleOwned( + Literal::CreateR2({{6, 8}, {10, 12}}), + Literal::CreateR2({{25, 36}, {49, 64}})))); +} + +XLA_TEST_F(MultiOutputFusionTest, + DISABLED_ON_CPU(MultiOutputReduceFusionScalar)) { + const string testcase = tensorflow::strings::StrCat(kScalarOps, R"( + fused_reduce { + p0 = f32[2,2,2]{2,1,0} parameter(0) + c0 = f32[] constant(0) + r1 = f32[2]{0} reduce(p0, c0), dimensions={0,2}, to_apply=Add + mul = f32[2,2,2]{2,1,0} multiply(p0, p0) + c1 = f32[] constant(1.17549e-38) + r2 = f32[2]{0} reduce(mul, c1), dimensions={0,2}, to_apply=Max + r3 = f32[2]{0} reduce(mul, c0), dimensions={0,2}, to_apply=Add + ROOT tuple = (f32[2]{0}, f32[2]{0}, f32[2]{0}) tuple(r1, r2, r3) + } + + ENTRY reduce { + p = f32[2,2,2]{2,1,0} parameter(0) + ROOT fusion = (f32[2]{0}, f32[2]{0}, f32[2]{0}) fusion(p), kind=kInput, + calls=fused_reduce + })"); + auto module = + HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest()) + .ValueOrDie(); + auto param = Literal::CreateR3({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}); + TF_ASSERT_OK_AND_ASSIGN(auto result, + Execute(std::move(module), {param.get()})); + EXPECT_TRUE(LiteralTestUtil::Equal( + *result, *Literal::MakeTupleOwned(Literal::CreateR1({14, 22}), + Literal::CreateR1({36, 64}), + Literal::CreateR1({66, 138})))); +} + } // namespace } // namespace xla diff --git a/tensorflow/compiler/xla/tests/params_test.cc b/tensorflow/compiler/xla/tests/params_test.cc index 97dab860c06bdd..838f1b4e2f0f0e 100644 --- a/tensorflow/compiler/xla/tests/params_test.cc +++ b/tensorflow/compiler/xla/tests/params_test.cc @@ -19,7 +19,6 @@ limitations under 
the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" @@ -161,7 +160,7 @@ XLA_TEST_F(ParamsTest, MissingParameter) { auto p = builder.Parameter(2, ShapeUtil::MakeShape(F32, {}), "param2"); auto computation_status = builder.Build(); - ASSERT_NE(computation_status.status(), tensorflow::Status::OK()); + ASSERT_NE(computation_status.status(), Status::OK()); } XLA_TEST_F(ParamsTest, UnusedParameter) { diff --git a/tensorflow/compiler/xla/tests/prng_test.cc b/tensorflow/compiler/xla/tests/prng_test.cc index 29a4f75001c688..1a2de6937c3e13 100644 --- a/tensorflow/compiler/xla/tests/prng_test.cc +++ b/tensorflow/compiler/xla/tests/prng_test.cc @@ -273,11 +273,11 @@ XLA_TEST_F(PrngTest, PassInGlobalRngSeed) { &execution_options_)); } - LiteralTestUtil::ExpectEqual(*result1, *result2); - LiteralTestUtil::ExpectEqual(*result1, *result3); - LiteralTestUtil::ExpectNotEqual(*result1, *result4); - LiteralTestUtil::ExpectNotEqual(*result4, *result5); - LiteralTestUtil::ExpectNotEqual(*result5, *result6); + EXPECT_TRUE(LiteralTestUtil::Equal(*result1, *result2)); + EXPECT_TRUE(LiteralTestUtil::Equal(*result1, *result3)); + EXPECT_FALSE(LiteralTestUtil::Equal(*result1, *result4)); + EXPECT_FALSE(LiteralTestUtil::Equal(*result4, *result5)); + EXPECT_FALSE(LiteralTestUtil::Equal(*result5, *result6)); } XLA_TEST_F(PrngTest, TenValuesN01) { diff --git a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc index c0a2c0ca4cb841..9052b188ed09a7 100644 --- a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc @@ -15,9 +15,9 @@ limitations under the License. #include +#include "tensorflow/compiler/xla/service/hlo_parser.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h" #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/test.h" @@ -73,7 +73,7 @@ ENTRY reduce.1 { } )"; - return tools::Parse(hlo_string); + return ParseHloString(hlo_string); } // TODO(b/72454718): XLA:GPU does not support executing code compiled without diff --git a/tensorflow/compiler/xla/tests/reduce_test.cc b/tensorflow/compiler/xla/tests/reduce_test.cc index bcc05c2d41d843..d671d40456a276 100644 --- a/tensorflow/compiler/xla/tests/reduce_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_test.cc @@ -34,7 +34,6 @@ limitations under the License. 
#include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" diff --git a/tensorflow/compiler/xla/tests/reduce_window_test.cc b/tensorflow/compiler/xla/tests/reduce_window_test.cc index 10a3da3a387641..266760e8202fdd 100644 --- a/tensorflow/compiler/xla/tests/reduce_window_test.cc +++ b/tensorflow/compiler/xla/tests/reduce_window_test.cc @@ -356,12 +356,8 @@ XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) { std::vector input_dims(6, 8); auto shape = ShapeUtil::MakeShape(F32, input_dims); - std::unique_ptr arg_literal = Literal::CreateFromShape(shape); - auto generator = [&](tensorflow::gtl::ArraySlice indexes) -> float { - return 1.0f; - }; - TF_EXPECT_OK(arg_literal->Populate(generator)); - + auto arg_literal = MakeUnique(shape); + arg_literal->PopulateWithValue(1.0f); const auto input = CreateConstantFromLiteral(*arg_literal, &builder_); Padding padding = Padding::kValid; @@ -371,13 +367,8 @@ XLA_TEST_P(ReduceWindowTest, R6AddMultipleStrides) { std::vector output_dims = {6, 8, 6, 6, 8, 8}; Shape result_shape = ShapeUtil::MakeShapeWithLayout(F32, output_dims, output_layout); - std::unique_ptr expected = Literal::CreateFromShape(result_shape); - auto out_generator = - [&](tensorflow::gtl::ArraySlice indexes) -> float { - return 27.0f; - }; - TF_EXPECT_OK(expected->Populate(out_generator)); - + auto expected = MakeUnique(result_shape); + expected->PopulateWithValue(27.0f); ComputeAndCompareLiteral(&builder_, *expected, {}, DefaultErrorSpec()); } @@ -1348,7 +1339,7 @@ INSTANTIATE_TEST_CASE_P( class ReduceWindowTextTest : public HloTestBase {}; TEST_F(ReduceWindowTextTest, R2General256x384) { - const string& hlo_string = R"( + const string hlo_string = R"( HloModule R2Window mul { lhs = f32[] parameter(0) @@ -1365,7 +1356,7 @@ ENTRY R2Window { } TEST_F(ReduceWindowTextTest, R2General256x384Layout01) { - const string& hlo_string = R"( + const string hlo_string = R"( HloModule R2Window mul { lhs = f32[] parameter(0) @@ -1382,7 +1373,7 @@ ROOT reduce-window = f32[256,384]{0,1} reduce-window(operand, constant), window= } TEST_F(ReduceWindowTextTest, R2General2x5) { - const string& hlo_string = R"( + const string hlo_string = R"( HloModule R2Window mul { lhs = f32[] parameter(0) @@ -1399,7 +1390,7 @@ ENTRY R2Window { } TEST_F(ReduceWindowTextTest, R2EffectiveScalar) { - const string& hlo_string = R"( + const string hlo_string = R"( HloModule R2Window mul { lhs = f32[] parameter(0) @@ -1417,7 +1408,7 @@ ENTRY R2Window { } TEST_F(ReduceWindowTextTest, R3EffectiveScalar) { - const string& hlo_string = R"( + const string hlo_string = R"( HloModule R3Window mul { lhs = f32[] parameter(0) @@ -1435,7 +1426,7 @@ ENTRY R3Window { } TEST_F(HloTestBase, ReduceWindowIdentity) { - const string& hlo_string = R"( + const string hlo_string = R"( HloModule ReduceWindowIdentity identity.pad_to_reduce_window { param0 = f32[] parameter(0) @@ -1444,7 +1435,26 @@ identity.pad_to_reduce_window { ENTRY reduce-window-identity { operand = f32[1,32,64]{2,1,0} parameter(0) constant.4466 = f32[] constant(0) - ROOT reduce-window = f32[1,33,64]{2,1,0} reduce-window(operand, constant.4466), window={size=1x1x1 pad=0_0x1_0x0_0}, to_apply=identity.pad_to_reduce_window + ROOT reduce-window = f32[1,33,64]{2,1,0} reduce-window(operand, constant.4466), window={size=1x1x1 
pad=0_0x1_0x0_0}, to_apply=identity.pad_to_reduce_window +} + +)"; + EXPECT_TRUE(RunAndCompare(hlo_string, tensorflow::gtl::nullopt)); +} + +TEST_F(HloTestBase, ReduceWindowS32) { + const string hlo_string = R"( +HloModule reduce-window + +%identity.pad_to_reduce_window (param0: s32[], param1: s32[]) -> s32[] { + %param0 = s32[] parameter(0) + ROOT %param1 = s32[] parameter(1) +} + +ENTRY %reduce-window (parameter.0: s32[81,8], parameter.1: s32[]) -> s32[82,8] { + %parameter.0 = s32[81,8]{1,0} parameter(0) + %parameter.1 = s32[] parameter(1) + ROOT %reduce-window = s32[82,8]{1,0} reduce-window(s32[81,8]{1,0} %parameter.0, s32[] %parameter.1), window={size=1x1 pad=0_1x0_0}, to_apply=%identity.pad_to_reduce_window } )"; diff --git a/tensorflow/compiler/xla/tests/reshape_motion_test.cc b/tensorflow/compiler/xla/tests/reshape_motion_test.cc index 5ebd5268992846..da1b588ec41cef 100644 --- a/tensorflow/compiler/xla/tests/reshape_motion_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_motion_test.cc @@ -33,7 +33,6 @@ limitations under the License. #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/lib/gtl/array_slice.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc index d7462d581b8596..a4580cd71d46ad 100644 --- a/tensorflow/compiler/xla/tests/reshape_test.cc +++ b/tensorflow/compiler/xla/tests/reshape_test.cc @@ -656,9 +656,9 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) { std::unique_ptr expected = Literal::CreateR2FromArray2D(expected_array); if (use_bfloat16()) { - expected = LiteralTestUtil::ConvertF32ToBF16(*expected); + expected = Literal::ConvertF32ToBF16(*expected); } - LiteralTestUtil::ExpectEqual(*expected, *actual); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected, *actual)); } XLA_TEST_P(ReshapeTest, R2ToR4_3x8_To_3x2x1x4) { @@ -731,7 +731,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x1x1_To_2x1) { builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{2, 1}); std::unique_ptr expected = - LiteralTestUtil::Reshape({2, 1}, {1, 0}, *input_literal); + Literal::ReshapeSlice({2, 1}, {1, 0}, *input_literal); ComputeAndCompareLiteral(&builder, *expected, {input_data.get()}, zero_error_spec_); } @@ -753,7 +753,7 @@ XLA_TEST_P(ReshapeTest, R4ToR2_2x1x4x1_To_4x2) { builder.Reshape(parameter, /*dimensions=*/{0, 1, 2, 3}, /*new_sizes=*/{4, 2}); std::unique_ptr expected = - LiteralTestUtil::Reshape({4, 2}, {1, 0}, *input_literal); + Literal::ReshapeSlice({4, 2}, {1, 0}, *input_literal); ComputeAndCompareLiteral(&builder, *expected, {input_data.get()}, zero_error_spec_); } @@ -817,7 +817,7 @@ XLA_TEST_P(ReshapeTest, NoopReshape) { // Since the reshape is a no-op, verify that it does not change the underlying // data. 
if (use_bfloat16()) { - auto expected = LiteralTestUtil::ConvertF32ToBF16(*input_literal); + auto expected = Literal::ConvertF32ToBF16(*input_literal); EXPECT_EQ(expected->data(), output_literal->data()); } else { EXPECT_EQ(input_literal->data(), output_literal->data()); @@ -886,7 +886,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeSimple) { /*new_sizes=*/new_bounds); std::unique_ptr expected = - LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal) + Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -915,7 +915,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstEffectiveR2) { /*new_sizes=*/new_bounds); std::unique_ptr expected = - LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal) + Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -944,7 +944,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1) { /*new_sizes=*/new_bounds); std::unique_ptr expected = - LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal) + Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -974,7 +974,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeMajorFirstMinorEffectiveR1InR2) { /*new_sizes=*/new_bounds); std::unique_ptr expected = - LiteralTestUtil::Reshape(new_bounds, {2, 3, 1, 0}, *input_literal) + Literal::ReshapeSlice(new_bounds, {2, 3, 1, 0}, *input_literal) ->Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0})); // Specify the requested output shape explicitly to ensure that this reshape @@ -1003,7 +1003,7 @@ XLA_TEST_P(ReshapeTest, R4TwoMinorTransposeTrivialR2) { /*new_sizes=*/new_bounds); std::unique_ptr expected = - LiteralTestUtil::Reshape(new_bounds, {1, 0, 2, 3}, *input_literal) + Literal::ReshapeSlice(new_bounds, {1, 0, 2, 3}, *input_literal) ->Relayout(input_literal->shape().layout()); // Specify the requested output shape explicitly to ensure that this reshape diff --git a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc index 8cbfcc6f5c4272..7cfca781acda15 100644 --- a/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc +++ b/tensorflow/compiler/xla/tests/round_trip_packed_literal_test.cc @@ -100,7 +100,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim0Minor) { EXPECT_EQ(46.0f, actual->Get({1, 1})); std::unique_ptr round_tripped = RoundTripToServer(*actual); - LiteralTestUtil::ExpectEqual(*round_tripped, *actual); + EXPECT_TRUE(LiteralTestUtil::Equal(*round_tripped, *actual)); } TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) { @@ -135,7 +135,7 @@ TEST_F(RoundTripPackedLiteralTest, RoundTripsR2F32Size2x2Dim1Minor) { EXPECT_EQ(46.0f, actual->Get({1, 1})); std::unique_ptr round_tripped = RoundTripToServer(*actual); - LiteralTestUtil::ExpectEqual(*round_tripped, *actual); + EXPECT_TRUE(LiteralTestUtil::Equal(*round_tripped, *actual)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc index 32db45f8a66266..f334a8c1318a59 100644 --- a/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc +++ 
b/tensorflow/compiler/xla/tests/round_trip_transfer_test.cc @@ -41,7 +41,7 @@ class RoundTripTransferTest : public ClientLibraryTestBase { client_->TransferToServer(original).ConsumeValueOrDie(); std::unique_ptr result = client_->Transfer(*data).ConsumeValueOrDie(); - LiteralTestUtil::ExpectEqual(original, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(original, *result)); } }; diff --git a/tensorflow/compiler/xla/tests/scalar_computations_test.cc b/tensorflow/compiler/xla/tests/scalar_computations_test.cc index 0c88bef69dfc52..308d3fc78a51e6 100644 --- a/tensorflow/compiler/xla/tests/scalar_computations_test.cc +++ b/tensorflow/compiler/xla/tests/scalar_computations_test.cc @@ -17,9 +17,10 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/literal_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -43,83 +44,80 @@ class ScalarComputationsTest : public ClientLibraryTestBase { protected: // A template for building and running a binary comparison test. template - void TestCompare(NativeT lhs, NativeT rhs, bool expected, - ComputationDataHandle (ComputationBuilder::*op)( - const ComputationDataHandle&, - const ComputationDataHandle&, - tensorflow::gtl::ArraySlice)) { - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle lhs_op = builder.ConstantR0(lhs); - ComputationDataHandle rhs_op = builder.ConstantR0(rhs); - ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {}); + void TestCompare( + NativeT lhs, NativeT rhs, bool expected, + XlaOp (XlaBuilder::*op)(const XlaOp&, const XlaOp&, + tensorflow::gtl::ArraySlice)) { + XlaBuilder builder(TestName()); + XlaOp lhs_op = builder.ConstantR0(lhs); + XlaOp rhs_op = builder.ConstantR0(rhs); + XlaOp result = (builder.*op)(lhs_op, rhs_op, {}); ComputeAndCompareR0(&builder, expected, {}); } template void TestMinMax(NativeT lhs, NativeT rhs, NativeT expected, - ComputationDataHandle (ComputationBuilder::*op)( - const ComputationDataHandle&, - const ComputationDataHandle&, - tensorflow::gtl::ArraySlice)) { - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle lhs_op = builder.ConstantR0(lhs); - ComputationDataHandle rhs_op = builder.ConstantR0(rhs); - ComputationDataHandle result = (builder.*op)(lhs_op, rhs_op, {}); + XlaOp (XlaBuilder::*op)(const XlaOp&, const XlaOp&, + tensorflow::gtl::ArraySlice)) { + XlaBuilder builder(TestName()); + XlaOp lhs_op = builder.ConstantR0(lhs); + XlaOp rhs_op = builder.ConstantR0(rhs); + XlaOp result = (builder.*op)(lhs_op, rhs_op, {}); ComputeAndCompareR0(&builder, expected, {}); } }; XLA_TEST_F(ScalarComputationsTest, ReturnScalarF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.ConstantR0(2.1f); ComputeAndCompareR0(&builder, 2.1f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, NegateScalarF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Neg(builder.ConstantR0(2.1f)); ComputeAndCompareR0(&builder, -2.1f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, NegateScalarS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); 
builder.Neg(builder.ConstantR0(2)); ComputeAndCompareR0(&builder, -2, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Add(builder.ConstantR0(2.1f), builder.ConstantR0(5.5f)); ComputeAndCompareR0(&builder, 7.6f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Add(builder.ConstantR0(2), builder.ConstantR0(5)); ComputeAndCompareR0(&builder, 7, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Add(builder.ConstantR0(35), builder.ConstantR0(57)); ComputeAndCompareR0(&builder, 92, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU8) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Add(builder.ConstantR0(35), builder.ConstantR0(57)); ComputeAndCompareR0(&builder, 92, {}); } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const uint64 a = static_cast(1) << 63; const uint64 b = a + 1; builder.Add(builder.ConstantR0(a), builder.ConstantR0(b)); @@ -128,7 +126,7 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsU64) { } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); const int64 a = static_cast(1) << 62; const int64 b = a - 1; builder.Add(builder.ConstantR0(a), builder.ConstantR0(b)); @@ -137,7 +135,7 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsS64) { } XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Add(builder.ConstantR0(0.25), builder.ConstantR0(3.5)); @@ -145,21 +143,21 @@ XLA_TEST_F(ScalarComputationsTest, AddTwoScalarsF64) { } XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Sub(builder.ConstantR0(2.1f), builder.ConstantR0(5.5f)); ComputeAndCompareR0(&builder, -3.4f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, SubtractTwoScalarsS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Sub(builder.ConstantR0(2), builder.ConstantR0(5)); ComputeAndCompareR0(&builder, -3, {}); } XLA_TEST_F(ScalarComputationsTest, CastS64ToF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto a = builder.Parameter(0, ShapeUtil::MakeShape(S64, {}), "a"); builder.ConvertElementType(a, F32); @@ -172,7 +170,7 @@ XLA_TEST_F(ScalarComputationsTest, CastS64ToF32) { } XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Mul(builder.Mul(builder.ConstantR0(2.1f), builder.ConstantR0(5.5f)), builder.ConstantR0(0.5f)); @@ -191,7 +189,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsS32) { for (int32 x : data) { for (int32 y : data) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Mul(builder.ConstantR0(x), builder.ConstantR0(y)); // Signed integer overflow is undefined behavior in C++. 
Convert the input @@ -210,7 +208,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsU32) { for (uint32 x : data) { for (uint32 y : data) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Mul(builder.ConstantR0(x), builder.ConstantR0(y)); uint32 expected = x * y; @@ -220,7 +218,7 @@ XLA_TEST_F(ScalarComputationsTest, MulTwoScalarsU32) { } XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Mul( builder.Mul(builder.ConstantR0(2), builder.ConstantR0(5)), builder.ConstantR0(1)); @@ -229,7 +227,7 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsS32) { } XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); std::unique_ptr a_literal = Literal::CreateR0(2.1f); std::unique_ptr b_literal = Literal::CreateR0(5.5f); std::unique_ptr c_literal = Literal::CreateR0(0.5f); @@ -241,9 +239,9 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) { std::unique_ptr c_data = client_->TransferToServer(*c_literal).ConsumeValueOrDie(); - ComputationDataHandle a = builder.Parameter(0, a_literal->shape(), "a"); - ComputationDataHandle b = builder.Parameter(1, b_literal->shape(), "b"); - ComputationDataHandle c = builder.Parameter(2, c_literal->shape(), "c"); + XlaOp a = builder.Parameter(0, a_literal->shape(), "a"); + XlaOp b = builder.Parameter(1, b_literal->shape(), "b"); + XlaOp c = builder.Parameter(2, c_literal->shape(), "c"); builder.Mul(builder.Mul(a, b), c); ComputeAndCompareR0(&builder, 5.775f, @@ -252,14 +250,14 @@ XLA_TEST_F(ScalarComputationsTest, MulThreeScalarsF32Params) { } XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Div(builder.ConstantR0(5.0f), builder.ConstantR0(2.5f)); ComputeAndCompareR0(&builder, 2.0f, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Rem(builder.ConstantR0(2.5f), builder.ConstantR0(5.0f)); ComputeAndCompareR0(&builder, 2.5f, {}, error_spec_); @@ -282,7 +280,7 @@ class DivS32Test : public ClientLibraryTestBase, XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) { DivS32Params p = GetParam(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Div(builder.ConstantR0(p.dividend), builder.ConstantR0(p.divisor)); @@ -291,7 +289,7 @@ XLA_TEST_P(DivS32Test, DivideTwoScalarsS32) { XLA_TEST_P(DivS32Test, RemainderTwoScalarsS32) { DivS32Params p = GetParam(); - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Rem(builder.ConstantR0(p.dividend), builder.ConstantR0(p.divisor)); @@ -300,9 +298,9 @@ XLA_TEST_P(DivS32Test, RemainderTwoScalarsS32) { XLA_TEST_P(DivS32Test, DivideTwoScalarsNonConstS32) { DivS32Params p = GetParam(); - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle dividend; - ComputationDataHandle divisor; + XlaBuilder builder(TestName()); + XlaOp dividend; + XlaOp divisor; auto dividendd = CreateR0Parameter(p.dividend, 0, "dividend", &builder, ÷nd); auto divisord = @@ -315,9 +313,9 @@ XLA_TEST_P(DivS32Test, DivideTwoScalarsNonConstS32) { XLA_TEST_P(DivS32Test, RemainderTwoScalarsNonConstDivisorS32) { DivS32Params p = GetParam(); - ComputationBuilder builder(client_, TestName()); - ComputationDataHandle 
dividend; - ComputationDataHandle divisor; + XlaBuilder builder(TestName()); + XlaOp dividend; + XlaOp divisor; auto dividendd = CreateR0Parameter(p.dividend, 0, "dividend", &builder, ÷nd); auto divisord = @@ -364,13 +362,13 @@ XLA_TEST_F(ScalarComputationsTest, DivU32s) { 0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX}; // clang-format on - Computation div_computation; + XlaComputation div_computation; { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); - ComputationDataHandle dividend = + XlaOp dividend = builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend"); - ComputationDataHandle divisor = + XlaOp divisor = builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor"); builder.Div(dividend, divisor); TF_ASSERT_OK_AND_ASSIGN(div_computation, builder.Build()); @@ -392,7 +390,7 @@ XLA_TEST_F(ScalarComputationsTest, DivU32s) { &execution_options_) .ConsumeValueOrDie(); auto expected_literal = Literal::CreateR0(dividend / divisor); - LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } } } @@ -405,13 +403,13 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) { 0, 1, 2, 17, 101, 3333, 0x7FFFFFFF, 0x80000000, UINT32_MAX - 1, UINT32_MAX}; // clang-format on - Computation rem_computation; + XlaComputation rem_computation; { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); - ComputationDataHandle dividend = + XlaOp dividend = builder.Parameter(0, ShapeUtil::MakeShape(U32, {}), "dividend"); - ComputationDataHandle divisor = + XlaOp divisor = builder.Parameter(1, ShapeUtil::MakeShape(U32, {}), "divisor"); builder.Rem(dividend, divisor); TF_ASSERT_OK_AND_ASSIGN(rem_computation, builder.Build()); @@ -433,14 +431,14 @@ XLA_TEST_F(ScalarComputationsTest, RemU32s) { &execution_options_) .ConsumeValueOrDie(); auto expected_literal = Literal::CreateR0(dividend % divisor); - LiteralTestUtil::ExpectEqual(*expected_literal, *actual_literal); + EXPECT_TRUE(LiteralTestUtil::Equal(*expected_literal, *actual_literal)); } } } } XLA_TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto x = builder.Parameter(0, ShapeUtil::MakeShape(S32, {}), "x"); builder.Rem(x, builder.ConstantR0(80000)); @@ -450,7 +448,7 @@ XLA_TEST_F(ScalarComputationsTest, RemainderTwoScalarsNonConstDividendS32) { } XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsU32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // This verifies 0xFFFFFFFE / 2 = 0x7FFFFFFF. If XLA incorrectly treated U32 // as S32, it would output -2 / 2 = -1 (0xFFFFFFFF). 
builder.Div(builder.ConstantR0(0xFFFFFFFE), @@ -460,7 +458,7 @@ XLA_TEST_F(ScalarComputationsTest, DivideTwoScalarsU32) { } XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsU32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Rem(builder.ConstantR0(11), builder.ConstantR0(3)); ComputeAndCompareR0(&builder, 2, {}); @@ -469,7 +467,7 @@ XLA_TEST_F(ScalarComputationsTest, RemTwoScalarsU32) { XLA_TEST_F(ScalarComputationsTest, AndBool) { for (bool x : {false, true}) { for (bool y : {false, true}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.And(builder.ConstantR0(x), builder.ConstantR0(y)); ComputeAndCompareR0(&builder, x && y, {}); @@ -480,7 +478,7 @@ XLA_TEST_F(ScalarComputationsTest, AndBool) { XLA_TEST_F(ScalarComputationsTest, AndS32) { for (int32 x : {0, 8}) { for (int32 y : {1, -16}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.And(builder.ConstantR0(x), builder.ConstantR0(y)); ComputeAndCompareR0(&builder, x & y, {}); @@ -491,7 +489,7 @@ XLA_TEST_F(ScalarComputationsTest, AndS32) { XLA_TEST_F(ScalarComputationsTest, AndU32) { for (uint32 x : {0, 8}) { for (uint32 y : {1, 16}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.And(builder.ConstantR0(x), builder.ConstantR0(y)); ComputeAndCompareR0(&builder, x & y, {}); @@ -502,7 +500,7 @@ XLA_TEST_F(ScalarComputationsTest, AndU32) { XLA_TEST_F(ScalarComputationsTest, OrBool) { for (bool x : {false, true}) { for (bool y : {false, true}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Or(builder.ConstantR0(x), builder.ConstantR0(y)); ComputeAndCompareR0(&builder, x || y, {}); @@ -513,7 +511,7 @@ XLA_TEST_F(ScalarComputationsTest, OrBool) { XLA_TEST_F(ScalarComputationsTest, OrS32) { for (int32 x : {0, 8}) { for (int32 y : {1, -16}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Or(builder.ConstantR0(x), builder.ConstantR0(y)); ComputeAndCompareR0(&builder, x | y, {}); @@ -524,7 +522,7 @@ XLA_TEST_F(ScalarComputationsTest, OrS32) { XLA_TEST_F(ScalarComputationsTest, OrU32) { for (uint32 x : {0, 8}) { for (uint32 y : {1, 16}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Or(builder.ConstantR0(x), builder.ConstantR0(y)); ComputeAndCompareR0(&builder, x | y, {}); @@ -534,7 +532,7 @@ XLA_TEST_F(ScalarComputationsTest, OrU32) { XLA_TEST_F(ScalarComputationsTest, NotBool) { for (bool x : {false, true}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Not(builder.ConstantR0(x)); ComputeAndCompareR0(&builder, !x, {}); @@ -543,7 +541,7 @@ XLA_TEST_F(ScalarComputationsTest, NotBool) { XLA_TEST_F(ScalarComputationsTest, NotS32) { for (int32 x : {-1, 0, 1}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Not(builder.ConstantR0(x)); ComputeAndCompareR0(&builder, ~x, {}); @@ -552,7 +550,7 @@ XLA_TEST_F(ScalarComputationsTest, NotS32) { XLA_TEST_F(ScalarComputationsTest, NotU32) { for (uint32 x : {0, 1, 2}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Not(builder.ConstantR0(x)); ComputeAndCompareR0(&builder, ~x, {}); @@ -560,7 +558,7 @@ XLA_TEST_F(ScalarComputationsTest, NotU32) { } XLA_TEST_F(ScalarComputationsTest, SelectScalarTrue) { - ComputationBuilder builder(client_, TestName()); + 
XlaBuilder builder(TestName()); builder.Select(builder.ConstantR0(true), // The predicate. builder.ConstantR0(123.0f), // The value on true. builder.ConstantR0(42.0f)); // The value on false. @@ -569,7 +567,7 @@ XLA_TEST_F(ScalarComputationsTest, SelectScalarTrue) { } XLA_TEST_F(ScalarComputationsTest, SelectScalarFalse) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Select(builder.ConstantR0(false), // The predicate. builder.ConstantR0(123.0f), // The value on true. builder.ConstantR0(42.0f)); // The value on false. @@ -580,7 +578,7 @@ XLA_TEST_F(ScalarComputationsTest, SelectScalarFalse) { // This test is an explicit version of what is happening in the following // templatized comparison tests. XLA_TEST_F(ScalarComputationsTest, CompareGtScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Gt(builder.ConstantR0(2.0f), builder.ConstantR0(1.0f)); ComputeAndCompareR0(&builder, true, {}); @@ -588,157 +586,156 @@ XLA_TEST_F(ScalarComputationsTest, CompareGtScalar) { // S32 comparisons. XLA_TEST_F(ScalarComputationsTest, CompareEqS32Greater) { - TestCompare(2, 1, false, &ComputationBuilder::Eq); + TestCompare(2, 1, false, &XlaBuilder::Eq); } XLA_TEST_F(ScalarComputationsTest, CompareEqS32Equal) { - TestCompare(3, 3, true, &ComputationBuilder::Eq); + TestCompare(3, 3, true, &XlaBuilder::Eq); } XLA_TEST_F(ScalarComputationsTest, CompareNeS32) { - TestCompare(2, 1, true, &ComputationBuilder::Ne); + TestCompare(2, 1, true, &XlaBuilder::Ne); } XLA_TEST_F(ScalarComputationsTest, CompareGeS32) { - TestCompare(2, 1, true, &ComputationBuilder::Ge); + TestCompare(2, 1, true, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGtS32) { - TestCompare(1, 5, false, &ComputationBuilder::Gt); + TestCompare(1, 5, false, &XlaBuilder::Gt); } XLA_TEST_F(ScalarComputationsTest, CompareLeS32) { - TestCompare(2, 1, false, &ComputationBuilder::Le); + TestCompare(2, 1, false, &XlaBuilder::Le); } XLA_TEST_F(ScalarComputationsTest, CompareLtS32) { - TestCompare(9, 7, false, &ComputationBuilder::Lt); + TestCompare(9, 7, false, &XlaBuilder::Lt); TestCompare(std::numeric_limits::min(), - std::numeric_limits::max(), true, - &ComputationBuilder::Lt); + std::numeric_limits::max(), true, &XlaBuilder::Lt); } // U32 comparisons. 
XLA_TEST_F(ScalarComputationsTest, CompareEqU32False) { - TestCompare(2, 1, false, &ComputationBuilder::Eq); + TestCompare(2, 1, false, &XlaBuilder::Eq); } XLA_TEST_F(ScalarComputationsTest, CompareNeU32) { - TestCompare(2, 1, true, &ComputationBuilder::Ne); + TestCompare(2, 1, true, &XlaBuilder::Ne); } XLA_TEST_F(ScalarComputationsTest, CompareGeU32Greater) { - TestCompare(2, 1, true, &ComputationBuilder::Ge); + TestCompare(2, 1, true, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeU32Equal) { - TestCompare(3, 3, true, &ComputationBuilder::Ge); + TestCompare(3, 3, true, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGtU32) { - TestCompare(1, 5, false, &ComputationBuilder::Gt); - TestCompare(5, 5, false, &ComputationBuilder::Gt); - TestCompare(5, 1, true, &ComputationBuilder::Gt); + TestCompare(1, 5, false, &XlaBuilder::Gt); + TestCompare(5, 5, false, &XlaBuilder::Gt); + TestCompare(5, 1, true, &XlaBuilder::Gt); } XLA_TEST_F(ScalarComputationsTest, CompareLeU32) { - TestCompare(2, 1, false, &ComputationBuilder::Le); + TestCompare(2, 1, false, &XlaBuilder::Le); } XLA_TEST_F(ScalarComputationsTest, CompareLtU32) { - TestCompare(9, 7, false, &ComputationBuilder::Lt); + TestCompare(9, 7, false, &XlaBuilder::Lt); TestCompare(0, std::numeric_limits::max(), true, - &ComputationBuilder::Lt); + &XlaBuilder::Lt); } // F32 comparisons. XLA_TEST_F(ScalarComputationsTest, CompareEqF32False) { - TestCompare(2.0, 1.3, false, &ComputationBuilder::Eq); + TestCompare(2.0, 1.3, false, &XlaBuilder::Eq); } XLA_TEST_F(ScalarComputationsTest, CompareNeF32) { - TestCompare(2.0, 1.3, true, &ComputationBuilder::Ne); + TestCompare(2.0, 1.3, true, &XlaBuilder::Ne); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32Greater) { - TestCompare(2.0, 1.9, true, &ComputationBuilder::Ge); + TestCompare(2.0, 1.9, true, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32Equal) { - TestCompare(3.5, 3.5, true, &ComputationBuilder::Ge); + TestCompare(3.5, 3.5, true, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGtF32) { - TestCompare(1.0, 5.2, false, &ComputationBuilder::Gt); + TestCompare(1.0, 5.2, false, &XlaBuilder::Gt); } XLA_TEST_F(ScalarComputationsTest, CompareLeF32) { - TestCompare(2.0, 1.2, false, &ComputationBuilder::Le); + TestCompare(2.0, 1.2, false, &XlaBuilder::Le); } XLA_TEST_F(ScalarComputationsTest, CompareLtF32) { - TestCompare(9.0, 7.2, false, &ComputationBuilder::Lt); + TestCompare(9.0, 7.2, false, &XlaBuilder::Lt); } // F32 comparisons with exceptional values. The test names encode the // left/right operands at the end, and use Minf and Mzero for -inf and -0.0. XLA_TEST_F(ScalarComputationsTest, CompareLtF32MinfMzero) { - TestCompare(-INFINITY, -0.0, true, &ComputationBuilder::Lt); + TestCompare(-INFINITY, -0.0, true, &XlaBuilder::Lt); } XLA_TEST_F(ScalarComputationsTest, CompareLtF32MzeroZero) { // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754. - TestCompare(-0.0, 0.0, false, &ComputationBuilder::Lt); + TestCompare(-0.0, 0.0, false, &XlaBuilder::Lt); } XLA_TEST_F(ScalarComputationsTest, CompareLtF32ZeroInf) { - TestCompare(0.0, INFINITY, true, &ComputationBuilder::Lt); + TestCompare(0.0, INFINITY, true, &XlaBuilder::Lt); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32MinfMzero) { - TestCompare(-INFINITY, -0.0, false, &ComputationBuilder::Ge); + TestCompare(-INFINITY, -0.0, false, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32MzeroZero) { // Comparisons of 0.0 to -0.0 consider them equal in IEEE 754. 
- TestCompare(-0.0, 0.0, true, &ComputationBuilder::Ge); + TestCompare(-0.0, 0.0, true, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, CompareGeF32ZeroInf) { - TestCompare(0.0, INFINITY, false, &ComputationBuilder::Ge); + TestCompare(0.0, INFINITY, false, &XlaBuilder::Ge); } XLA_TEST_F(ScalarComputationsTest, ExpScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Exp(builder.ConstantR0(2.0f)); ComputeAndCompareR0(&builder, 7.3890562, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, LogScalar) { - ComputationBuilder builder(client_, "log"); + XlaBuilder builder("log"); builder.Log(builder.ConstantR0(2.0f)); ComputeAndCompareR0(&builder, 0.6931471, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, TanhScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Tanh(builder.ConstantR0(2.0f)); ComputeAndCompareR0(&builder, 0.96402758, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, TanhDoubleScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Tanh(builder.ConstantR0(2.0)); ComputeAndCompareR0(&builder, 0.96402758, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, PowScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Pow(builder.ConstantR0(2.0f), builder.ConstantR0(3.0f)); ComputeAndCompareR0(&builder, 8.0, {}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(-1), // The lower bound. builder.ConstantR0(5), // The operand to be clamped. builder.ConstantR0(3)); // The upper bound. @@ -747,7 +744,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHighS32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(-1), // The lower bound. builder.ConstantR0(2), // The operand to be clamped. builder.ConstantR0(3)); // The upper bound. @@ -756,7 +753,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleS32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(-1), // The lower bound. builder.ConstantR0(-5), // The operand to be clamped. builder.ConstantR0(3)); // The upper bound. @@ -765,7 +762,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarLowS32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(1), // The lower bound. builder.ConstantR0(5), // The operand to be clamped. builder.ConstantR0(3)); // The upper bound. @@ -774,7 +771,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHighU32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(1), // The lower bound. builder.ConstantR0(2), // The operand to be clamped. builder.ConstantR0(3)); // The upper bound. @@ -783,7 +780,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleU32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(1), // The lower bound. 
builder.ConstantR0(0), // The operand to be clamped. builder.ConstantR0(3)); // The upper bound. @@ -792,7 +789,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarLowU32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(5.0f), // The operand to be clamped. builder.ConstantR0(3.0f)); // The upper bound. @@ -801,7 +798,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarHighF32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(2.5f), // The operand to be clamped. builder.ConstantR0(3.0f)); // The upper bound. @@ -810,7 +807,7 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarMiddleF32) { } XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Clamp(builder.ConstantR0(2.0f), // The lower bound. builder.ConstantR0(-5.0f), // The operand to be clamped. builder.ConstantR0(3.0f)); // The upper bound. @@ -819,70 +816,70 @@ XLA_TEST_F(ScalarComputationsTest, ClampScalarLowF32) { } XLA_TEST_F(ScalarComputationsTest, MinS32Above) { - TestMinMax(10, 3, 3, &ComputationBuilder::Min); + TestMinMax(10, 3, 3, &XlaBuilder::Min); } XLA_TEST_F(ScalarComputationsTest, MinS32Below) { - TestMinMax(-100, 3, -100, &ComputationBuilder::Min); + TestMinMax(-100, 3, -100, &XlaBuilder::Min); } XLA_TEST_F(ScalarComputationsTest, MaxS32Above) { - TestMinMax(10, 3, 10, &ComputationBuilder::Max); + TestMinMax(10, 3, 10, &XlaBuilder::Max); } XLA_TEST_F(ScalarComputationsTest, MaxS32Below) { - TestMinMax(-100, 3, 3, &ComputationBuilder::Max); + TestMinMax(-100, 3, 3, &XlaBuilder::Max); } XLA_TEST_F(ScalarComputationsTest, MinU32Above) { const uint32 large = std::numeric_limits::max(); - TestMinMax(large, 3, 3, &ComputationBuilder::Min); + TestMinMax(large, 3, 3, &XlaBuilder::Min); } XLA_TEST_F(ScalarComputationsTest, MinU32Below) { - TestMinMax(0, 5, 0, &ComputationBuilder::Min); + TestMinMax(0, 5, 0, &XlaBuilder::Min); } XLA_TEST_F(ScalarComputationsTest, MaxU32Above) { const uint32 large = std::numeric_limits::max(); - TestMinMax(large, 3, large, &ComputationBuilder::Max); + TestMinMax(large, 3, large, &XlaBuilder::Max); } XLA_TEST_F(ScalarComputationsTest, MaxU32Below) { - TestMinMax(0, 5, 5, &ComputationBuilder::Max); + TestMinMax(0, 5, 5, &XlaBuilder::Max); } XLA_TEST_F(ScalarComputationsTest, MinF32Above) { - TestMinMax(10.1f, 3.1f, 3.1f, &ComputationBuilder::Min); + TestMinMax(10.1f, 3.1f, 3.1f, &XlaBuilder::Min); } XLA_TEST_F(ScalarComputationsTest, MinF32Below) { - TestMinMax(-100.1f, 3.1f, -100.1f, &ComputationBuilder::Min); + TestMinMax(-100.1f, 3.1f, -100.1f, &XlaBuilder::Min); } XLA_TEST_F(ScalarComputationsTest, MinPropagatesNan) { SetFastMathDisabled(true); - TestMinMax(NAN, 3.1f, NAN, &ComputationBuilder::Min); - TestMinMax(-3.1f, NAN, NAN, &ComputationBuilder::Min); + TestMinMax(NAN, 3.1f, NAN, &XlaBuilder::Min); + TestMinMax(-3.1f, NAN, NAN, &XlaBuilder::Min); } XLA_TEST_F(ScalarComputationsTest, MaxF32Above) { - TestMinMax(10.1f, 3.1f, 10.1f, &ComputationBuilder::Max); + TestMinMax(10.1f, 3.1f, 10.1f, &XlaBuilder::Max); } XLA_TEST_F(ScalarComputationsTest, MaxF32Below) { - TestMinMax(-100.1f, 3.1f, 3.1f, &ComputationBuilder::Max); + TestMinMax(-100.1f, 3.1f, 3.1f, 
&XlaBuilder::Max); } XLA_TEST_F(ScalarComputationsTest, MaxPropagatesNan) { SetFastMathDisabled(true); - TestMinMax(NAN, 3.1f, NAN, &ComputationBuilder::Max); - TestMinMax(-3.1f, NAN, NAN, &ComputationBuilder::Max); + TestMinMax(NAN, 3.1f, NAN, &XlaBuilder::Max); + TestMinMax(-3.1f, NAN, NAN, &XlaBuilder::Max); } XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) { // Compute the expression (1 * (3 - 1) * (7 + 0) - 4) / 20. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Div( b.Sub(b.Mul(b.ConstantR0(1), b.Mul(b.Sub(b.ConstantR0(3), b.ConstantR0(1)), @@ -895,7 +892,7 @@ XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionF32) { XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) { // Compute the expression 1 * (3 - 1) * (7 + 0) - 4. - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); b.Sub(b.Mul(b.ConstantR0(1), b.Mul(b.Sub(b.ConstantR0(3), b.ConstantR0(1)), b.Add(b.ConstantR0(7), b.ConstantR0(0)))), @@ -905,21 +902,20 @@ XLA_TEST_F(ScalarComputationsTest, ComplicatedArithmeticExpressionS32) { } XLA_TEST_F(ScalarComputationsTest, SqrtF320) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); Literal zero_literal = Literal::Zero(PrimitiveType::F32); std::unique_ptr zero_data = client_->TransferToServer(zero_literal).ConsumeValueOrDie(); - ComputationDataHandle zero = - builder.Parameter(0, zero_literal.shape(), "zero"); + XlaOp zero = builder.Parameter(0, zero_literal.shape(), "zero"); builder.SqrtF32(zero); ComputeAndCompareR0(&builder, 0.0f, {zero_data.get()}, error_spec_); } XLA_TEST_F(ScalarComputationsTest, RoundScalar) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); builder.Round(builder.ConstantR0(1.4f)); ComputeAndCompareR0(&builder, 1.0f, {}, error_spec_); diff --git a/tensorflow/compiler/xla/tests/select_test.cc b/tensorflow/compiler/xla/tests/select_test.cc index 009e7d24c5cbfa..72707f224446c7 100644 --- a/tensorflow/compiler/xla/tests/select_test.cc +++ b/tensorflow/compiler/xla/tests/select_test.cc @@ -16,13 +16,12 @@ limitations under the License. 
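The scalar-computation hunks above all follow one pattern: the builder no longer takes the client handle, and the comparison/min-max helpers now receive XlaBuilder member-function pointers instead of ComputationBuilder ones. A minimal sketch of the new form, assuming the ClientLibraryTestBase helpers (TestName, ComputeAndCompareR0, TestCompare) already used by these tests:

    XlaBuilder builder(TestName());
    builder.Gt(builder.ConstantR0<float>(2.0f), builder.ConstantR0<float>(1.0f));
    ComputeAndCompareR0<bool>(&builder, true, {});

    // Comparison helpers take the member-function pointer of the new builder type.
    TestCompare<int32>(2, 1, false, &XlaBuilder::Eq);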
#include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" @@ -35,7 +34,7 @@ class SelectTest : public ClientLibraryTestBase { }; TEST_F(SelectTest, SelectScalarF32True) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR0(true); auto on_true = builder.ConstantR0(123.0f); auto on_false = builder.ConstantR0(42.0f); @@ -45,7 +44,7 @@ TEST_F(SelectTest, SelectScalarF32True) { } TEST_F(SelectTest, SelectScalarS32True) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR0(true); auto on_true = builder.ConstantR0(-42); auto on_false = builder.ConstantR0(42); @@ -55,7 +54,7 @@ TEST_F(SelectTest, SelectScalarS32True) { } TEST_F(SelectTest, SelectScalarF32False) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR0(false); auto on_true = builder.ConstantR0(123.0f); auto on_false = builder.ConstantR0(42.0f); @@ -65,7 +64,7 @@ TEST_F(SelectTest, SelectScalarF32False) { } XLA_TEST_F(SelectTest, SelectR1S0F32WithConstantR1S0PRED) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR1({}); auto on_true = builder.ConstantR1({}); auto on_false = builder.ConstantR1({}); @@ -75,7 +74,7 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithConstantR1S0PRED) { } TEST_F(SelectTest, SelectR1F32WithConstantR1PRED) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR1({false, true, false, true, false}); auto on_true = builder.ConstantR1({-2.5f, 25.5f, 2.25f, -10.0f, 6.0f}); auto on_false = builder.ConstantR1({10.0f, 5.0f, 1.0f, 10.0f, -6.0f}); @@ -88,7 +87,7 @@ TEST_F(SelectTest, SelectR1F32WithConstantR1PRED) { XLA_TEST_F(SelectTest, SelectR1S0F32WithCmpR1S0S32s) { // Similar to SelectR1S0F32WithConstantR1S0PRED, except that the pred vector // is not a constant, but rather the result of comparing two other vectors. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto v1 = builder.ConstantR1({}); auto v2 = builder.ConstantR1({}); auto cmp = builder.Eq(v1, v2); @@ -102,7 +101,7 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithCmpR1S0S32s) { TEST_F(SelectTest, SelectR1F32WithCmpR1S32s) { // Similar to SelectR1F32WithConstantR1PRED, except that the pred vector is // not a constant, but rather the result of comparing two other vectors. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto v1 = builder.ConstantR1({1, 2, 3, 4, 5}); auto v2 = builder.ConstantR1({9, 2, 9, 4, 9}); auto cmp = builder.Eq(v1, v2); @@ -116,7 +115,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1S32s) { TEST_F(SelectTest, SelectR1F32WithCmpR1F32s) { // Similar to SelectR1F32WithCmpR1S32s, except "gt"-comparing two R1F32s. 
- ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto v1 = builder.ConstantR1({1.0f, 2.0f, 3.0f, 4.0f, 5.0f}); auto v2 = builder.ConstantR1({-1.0f, -2.0f, 13.0f, 14.0f, 4.4f}); auto cmp = builder.Gt(v1, v2); @@ -131,9 +130,9 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32s) { TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsSmall) { // Selects among two R1F32s, which come from parameters. v1 and v2 are // compared, and selection between them happens based on a gt-comparison mask. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); - ComputationDataHandle v1, v2; + XlaOp v1, v2; std::unique_ptr param0_data = CreateR1Parameter( {41.0f, 2.0f, 3.0f, 84.0f}, /*parameter_number=*/0, /*name=*/"v1", /*builder=*/&builder, /*data_handle=*/&v1); @@ -151,7 +150,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsSmall) { TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsLarge) { // Similar to SelectR1F32WithCmpR1F32sFromParamsSmall, except that the // data size passed in and out is large. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); // Number of floats in the data passed into and out of the computation. constexpr int datalen = 15 * 1000; @@ -174,7 +173,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsLarge) { expected_vec.push_back(larger); } - ComputationDataHandle v1, v2; + XlaOp v1, v2; std::unique_ptr param0_data = CreateR1Parameter(v1vec, /*parameter_number=*/0, /*name=*/"v1", /*builder=*/&builder, /*data_handle=*/&v1); @@ -192,7 +191,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32sFromParamsLarge) { TEST_F(SelectTest, SelectR1F32WithCmpR1S32ToScalar) { // "gt"-compares a R1S32 with a S32 scalar, and uses the resulting R1PRED to // select between two R1F32s. - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto v = builder.ConstantR1({1, -1, 2, -2}); auto s = builder.ConstantR0(0); auto cmp = builder.Gt(v, s); @@ -209,7 +208,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1S32ToScalar) { TEST_F(SelectTest, SelectR1F32WithCmpR1F32ToScalar) { // "gt"-compares a R1F32 with a F32 scalar, and uses the resulting R1PRED to // select between two R1F32s. 
- ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto v = builder.ConstantR1({1.0f, 2.0f, 3.0f, 4.0f}); auto s = builder.ConstantR0(2.5f); auto cmp = builder.Gt(v, s); @@ -225,7 +224,7 @@ TEST_F(SelectTest, SelectR1F32WithCmpR1F32ToScalar) { XLA_TEST_F(SelectTest, SelectR1S0F32WithScalarPredicate) { for (bool which : {false, true}) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR0(which); auto on_true = builder.ConstantR1({}); auto on_false = builder.ConstantR1({}); @@ -236,7 +235,7 @@ XLA_TEST_F(SelectTest, SelectR1S0F32WithScalarPredicate) { } TEST_F(SelectTest, SelectR1F32WithScalarPredicateTrue) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR0(true); auto on_true = builder.ConstantR1({-2.5f, 25.5f}); auto on_false = builder.ConstantR1({10.0f, 5.0f}); @@ -246,7 +245,7 @@ TEST_F(SelectTest, SelectR1F32WithScalarPredicateTrue) { } TEST_F(SelectTest, SelectR1F32WithScalarPredicateFalse) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto pred = builder.ConstantR0(false); auto on_true = builder.ConstantR1({-2.5f, 25.5f}); auto on_false = builder.ConstantR1({10.0f, 5.0f}); diff --git a/tensorflow/compiler/xla/tests/set_return_value_test.cc b/tensorflow/compiler/xla/tests/set_return_value_test.cc deleted file mode 100644 index 29f79ec28a1ae6..00000000000000 --- a/tensorflow/compiler/xla/tests/set_return_value_test.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#include - -#include "tensorflow/compiler/xla/client/computation_builder.h" -#include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/tests/client_library_test_base.h" -#include "tensorflow/compiler/xla/tests/literal_test_util.h" -#include "tensorflow/core/lib/core/status.h" -#include "tensorflow/core/platform/test.h" - -namespace xla { -namespace { - -class SetReturnValueTest : public ClientLibraryTestBase {}; - -TEST_F(SetReturnValueTest, NoSetValue) { - ComputationBuilder builder(client_, "no_set_value"); - auto alpha = builder.ConstantR0(1.0); - auto x = builder.ConstantR1( - {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); - auto ax = builder.Add(alpha, x); - auto aax = builder.Add(alpha, ax); - - std::vector expected = {1.0, 3.0, 4.0, 0.0, -1.0, - 5.0, 6.0, -2.0, -3.0, 7.0}; - - ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); -} - -TEST_F(SetReturnValueTest, SetValue) { - ComputationBuilder builder(client_, "set_value"); - auto alpha = builder.ConstantR0(1.0); - auto x = builder.ConstantR1( - {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); - auto ax = builder.Add(alpha, x); - auto aax = builder.Add(alpha, ax); - auto builder_status = builder.SetReturnValue(ax); - EXPECT_TRUE(builder_status.ok()); - - std::vector expected = {0.0, 2.0, 3.0, -1.0, -2.0, - 4.0, 5.0, -3.0, -4.0, 6.0}; - - ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); -} - -TEST_F(SetReturnValueTest, SetValueAndModify) { - ComputationBuilder builder(client_, "set_value_and_modify"); - auto alpha = builder.ConstantR0(1.0); - auto x = builder.ConstantR1( - {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); - auto ax = builder.Add(alpha, x); - auto aax = builder.Add(alpha, ax); - auto builder_status = builder.SetReturnValue(ax); - EXPECT_TRUE(builder_status.ok()); - auto aaax = builder.Add(alpha, aax); - - std::vector expected = {0.0, 2.0, 3.0, -1.0, -2.0, - 4.0, 5.0, -3.0, -4.0, 6.0}; - - ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); -} - -TEST_F(SetReturnValueTest, SetValueMultipleTimesAndModify) { - ComputationBuilder builder(client_, "set_value_multiple_times_and_modify"); - auto alpha = builder.ConstantR0(1.0); - auto x = builder.ConstantR1( - {-1.0, 1.0, 2.0, -2.0, -3.0, 3.0, 4.0, -4.0, -5.0, 5.0}); - auto ax = builder.Add(alpha, x); - auto aax = builder.Add(alpha, ax); - auto builder_status = builder.SetReturnValue(aax); - EXPECT_TRUE(builder_status.ok()); - auto aaax = builder.Add(alpha, aax); - builder_status = builder.SetReturnValue(ax); - EXPECT_TRUE(builder_status.ok()); - auto aaaax = builder.Add(alpha, aaax); - - std::vector expected = {0.0, 2.0, 3.0, -1.0, -2.0, - 4.0, 5.0, -3.0, -4.0, 6.0}; - - ComputeAndCompareR1(&builder, expected, {}, ErrorSpec(0.0001)); -} - -} // namespace -} // namespace xla diff --git a/tensorflow/compiler/xla/tests/slice_test.cc b/tensorflow/compiler/xla/tests/slice_test.cc index 52195db2aa7471..5653bf11a7364b 100644 --- a/tensorflow/compiler/xla/tests/slice_test.cc +++ b/tensorflow/compiler/xla/tests/slice_test.cc @@ -197,9 +197,10 @@ class SliceR1Test : public ClientLibraryTestBase, // vector. 
tensorflow::gtl::InlinedVector input(spec.input_dim0); std::iota(input.begin(), input.end(), NativeT()); + auto literal = Literal::CreateR1(input); XlaBuilder builder(TestName()); - auto original = builder.ConstantR1(input); + auto original = builder.Parameter(0, literal->shape(), "p0"); builder.Slice(original, {spec.slice_start}, {spec.slice_limit}, {spec.slice_stride}); @@ -210,7 +211,9 @@ class SliceR1Test : public ClientLibraryTestBase, expected.push_back(i); } - ComputeAndCompareR1(&builder, expected, {}); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr arg, + client_->TransferToServer(*literal)); + ComputeAndCompareR1(&builder, expected, {arg.get()}); } }; @@ -365,15 +368,18 @@ XLA_TEST_P(SliceR2Test, DoIt) { const R2Spec& spec = GetParam(); Array2D input(spec.input_dim0, spec.input_dim1); input.FillUnique(); + auto literal = Literal::CreateR2FromArray2DWithLayout( + input, LayoutUtil::MakeLayout(spec.layout)); XlaBuilder builder(TestName()); - auto a = builder.ConstantR2FromArray2DWithLayout( - input, LayoutUtil::MakeLayout(spec.layout)); + auto a = builder.Parameter(0, literal->shape(), "p0"); builder.Slice(a, spec.slice_starts, spec.slice_limits, spec.slice_strides); + TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr arg, + client_->TransferToServer(*literal)); std::unique_ptr> expected = ReferenceUtil::Slice2D( input, spec.slice_starts, spec.slice_limits, spec.slice_strides); - ComputeAndCompareR2(&builder, *expected, {}); + ComputeAndCompareR2(&builder, *expected, {arg.get()}); } INSTANTIATE_TEST_CASE_P( @@ -453,7 +459,7 @@ class SliceR4Test : public ClientLibraryTestBase, void Run(const R4Spec& spec) { Array4D values(spec.input_dims[0], spec.input_dims[1], spec.input_dims[2], spec.input_dims[3]); - values.FillRandom(3.14f); + values.FillIota(3.14159); auto expected = ReferenceUtil::Slice4D( values, spec.slice_starts, spec.slice_limits, spec.slice_strides); XlaBuilder builder(TestName()); diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc index 997a1d8273736a..dd7c5417336342 100644 --- a/tensorflow/compiler/xla/tests/test_utils.cc +++ b/tensorflow/compiler/xla/tests/test_utils.cc @@ -26,6 +26,7 @@ namespace { template void PopulateWithRandomFloatingPointDataImpl(Literal* literal, std::minstd_rand0* engine) { + CHECK(engine != nullptr); CHECK_EQ(literal->shape().element_type(), primitive_util::NativeToPrimitiveType()); // Create uniform numbers between 1 and 1.125 to avoid creating denormal @@ -59,12 +60,14 @@ void PopulateWithRandomFloatingPointDataImpl(Literal* literal, template void PopulateWithRandomFloatingPointData(Literal* literal, std::minstd_rand0* engine) { + CHECK(engine != nullptr); PopulateWithRandomFloatingPointDataImpl(literal, engine); } template <> void PopulateWithRandomFloatingPointData(Literal* literal, std::minstd_rand0* engine) { + CHECK(engine != nullptr); PopulateWithRandomFloatingPointDataImpl(literal, engine); } @@ -73,6 +76,7 @@ void PopulateWithRandomFloatingPointData(Literal* literal, template <> void PopulateWithRandomFloatingPointData(Literal* literal, std::minstd_rand0* engine) { + CHECK(engine != nullptr); CHECK_EQ(literal->shape().element_type(), BF16); std::uniform_real_distribution generator(-0.9f, 1.0f); TF_CHECK_OK(literal->Populate( @@ -84,6 +88,7 @@ void PopulateWithRandomFloatingPointData(Literal* literal, template void PopulateWithRandomIntegralData(Literal* literal, std::minstd_rand0* engine) { + CHECK(engine != nullptr); CHECK_EQ(literal->shape().element_type(), 
primitive_util::NativeToPrimitiveType()); std::uniform_int_distribution generator( @@ -107,7 +112,10 @@ StatusOr> MakeFakeLiteralInternal( } return Literal::MakeTupleOwned(std::move(elements)); } - std::unique_ptr literal = Literal::CreateFromShape(shape); + if (engine == nullptr) { + return Literal::CreateFromShape(shape); + } + auto literal = MakeUnique(shape); switch (shape.element_type()) { case BF16: PopulateWithRandomFloatingPointData(literal.get(), engine); @@ -201,11 +209,13 @@ std::unique_ptr MakeRandomNonwrappingSliceIndex( std::minstd_rand0* engine) { const int64 rank = ShapeUtil::Rank(input_shape); std::vector start_indices(rank); - for (int i = 0; i < rank; ++i) { - const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) - - ShapeUtil::GetDimension(slice_shape, i); - std::uniform_int_distribution generator(0, upper_bound); - start_indices[i] = generator(*engine); + if (engine != nullptr) { + for (int i = 0; i < rank; ++i) { + const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) - + ShapeUtil::GetDimension(slice_shape, i); + std::uniform_int_distribution generator(0, upper_bound); + start_indices[i] = generator(*engine); + } } return Literal::CreateR1(start_indices); } @@ -321,26 +331,26 @@ StatusOr> MakeConstrainedArgument( } // namespace -StatusOr> MakeFakeLiteral(const Shape& shape) { - std::minstd_rand0 engine; - return MakeFakeLiteralInternal(shape, &engine); +StatusOr> MakeFakeLiteral(const Shape& shape, + bool pseudo_random) { + auto engine = pseudo_random ? MakeUnique() : nullptr; + return MakeFakeLiteralInternal(shape, engine.get()); } StatusOr>> MakeFakeArguments( - HloModule* const module) { + HloModule* const module, bool pseudo_random) { TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(*module)); const auto params = module->entry_computation()->parameter_instructions(); - std::minstd_rand0 engine; + auto engine = pseudo_random ? MakeUnique() : nullptr; std::vector> arguments(params.size()); for (int i = 0; i < params.size(); ++i) { - TF_ASSIGN_OR_RETURN( - arguments[i], MakeConstrainedArgument(*dataflow, *params[i], &engine)); + TF_ASSIGN_OR_RETURN(arguments[i], MakeConstrainedArgument( + *dataflow, *params[i], engine.get())); } return std::move(arguments); } -Status VerifyHloModule(const se::Platform& platform, HloModule* const module, - bool allow_mixed_precision) { +Status VerifyHloModule(HloModule* const module, bool allow_mixed_precision) { return HloVerifier(allow_mixed_precision).Run(module).status(); } diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h index 30c147910cae85..a8689f64981569 100644 --- a/tensorflow/compiler/xla/tests/test_utils.h +++ b/tensorflow/compiler/xla/tests/test_utils.h @@ -55,20 +55,32 @@ class PseudorandomGenerator { }; // Generates fake data in a literal of the given shape, or returns an error -// status if the element type is currently unhandled for fake data generation. -StatusOr> MakeFakeLiteral(const Shape& shape); +// status if the element type is currently unhandled for fake data +// generation. See below for documentation of pseudo_random. +StatusOr> MakeFakeLiteral(const Shape& shape, + bool pseudo_random = true); // Generates a vector of arguments containing fake data. The number, shape and // layout of the arguments is appropriate for given HLO module. // // Will handle special cases such as making sure that indices used for dynamic // slices are bounded, reduces that call adds use 0 as an init value, etc. 
+// +// If pseudo_random is true, the generated numbers will be generated +// deterministically in a pseudo random way unless the values are constrated to +// be e.g. init values as above. If pseudo_random is false, the returned values +// will be generated in a faster way that yields less interesting data, e.g. the +// values may all be just the same value. +// +// TODO(b/79942829): Make interesting argument generation fast enough that using +// pseudo_random does not save any noticeable amount of time so that the +// parameter can be removed. StatusOr>> MakeFakeArguments( - HloModule* const module); + HloModule* const module, bool pseudo_random = true); // Check that a given module satisfies various constraints before trying to // execute it. -Status VerifyHloModule(const se::Platform& platform, HloModule* const module, +Status VerifyHloModule(HloModule* const module, bool allow_mixed_precision = false); } // namespace xla diff --git a/tensorflow/compiler/xla/tests/transfer_manager_test.cc b/tensorflow/compiler/xla/tests/transfer_manager_test.cc index e2067bc1b835a9..0063e7ad415e9b 100644 --- a/tensorflow/compiler/xla/tests/transfer_manager_test.cc +++ b/tensorflow/compiler/xla/tests/transfer_manager_test.cc @@ -175,7 +175,7 @@ XLA_TEST_F(TransferManagerTest, TransferTuple) { transfer_manager_->TransferLiteralFromDevice( stream_executor_, device_buffer)); - LiteralTestUtil::ExpectEqual(*literal, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) { @@ -189,7 +189,7 @@ XLA_TEST_F(TransferManagerTest, TransferEmptyTuple) { transfer_manager_->TransferLiteralFromDevice( stream_executor_, device_buffer)); - LiteralTestUtil::ExpectEqual(*literal, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferNestedTuple) { @@ -209,7 +209,7 @@ XLA_TEST_F(TransferManagerTest, TransferNestedTuple) { transfer_manager_->TransferLiteralFromDevice( stream_executor_, device_buffer)); - LiteralTestUtil::ExpectEqual(*literal, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferComplexValue) { @@ -224,7 +224,7 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValue) { transfer_manager_->TransferLiteralFromDevice( stream_executor_, device_buffer)); - LiteralTestUtil::ExpectEqual(*literal, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) { @@ -243,7 +243,7 @@ XLA_TEST_F(TransferManagerTest, TransferComplexValueInTuple) { transfer_manager_->TransferLiteralFromDevice( stream_executor_, device_buffer)); - LiteralTestUtil::ExpectEqual(*literal, *result); + EXPECT_TRUE(LiteralTestUtil::Equal(*literal, *result)); } } // namespace diff --git a/tensorflow/compiler/xla/tests/transpose_test.cc b/tensorflow/compiler/xla/tests/transpose_test.cc index fe5a1778a2cecf..fe1e3da7eca00e 100644 --- a/tensorflow/compiler/xla/tests/transpose_test.cc +++ b/tensorflow/compiler/xla/tests/transpose_test.cc @@ -16,14 +16,13 @@ limitations under the License. 
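The test_utils changes above make the random engine optional: MakeFakeLiteral and MakeFakeArguments now accept a pseudo_random flag, and a null engine produces cheap, uninteresting values instead of pseudo-random ones. A short usage sketch, assuming a caller that returns Status and an already-built HloModule named module:

    // Deterministic pseudo-random data (the default), except where arguments are
    // constrained (e.g. reduce init values, in-bounds dynamic-slice indices).
    TF_ASSIGN_OR_RETURN(auto fake_args, MakeFakeArguments(module.get()));

    // Faster but less interesting data; the values may all be identical.
    TF_ASSIGN_OR_RETURN(auto cheap_args,
                        MakeFakeArguments(module.get(), /*pseudo_random=*/false));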
#include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/reference_util.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/hlo_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" -#include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/platform/test.h" namespace xla { @@ -38,7 +37,7 @@ class TransposeTest : public ClientLibraryTestBase { }; XLA_TEST_F(TransposeTest, Transpose0x0) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 0)); auto result = builder.Transpose(lhs, {1, 0}); @@ -46,7 +45,7 @@ XLA_TEST_F(TransposeTest, Transpose0x0) { } XLA_TEST_F(TransposeTest, Transpose0x42) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto lhs = builder.ConstantR2FromArray2D(Array2D(0, 42)); auto result = builder.Transpose(lhs, {1, 0}); @@ -54,7 +53,7 @@ XLA_TEST_F(TransposeTest, Transpose0x42) { } XLA_TEST_F(TransposeTest, Transpose7x0) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto lhs = builder.ConstantR2FromArray2D(Array2D(7, 0)); auto result = builder.Transpose(lhs, {1, 0}); @@ -62,7 +61,7 @@ XLA_TEST_F(TransposeTest, Transpose7x0) { } TEST_F(TransposeTest, Transpose2x2) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto lhs = builder.ConstantR2({ {1.0, 2.0}, {3.0, 4.0}, }); @@ -74,7 +73,7 @@ TEST_F(TransposeTest, Transpose2x2) { } XLA_TEST_F(TransposeTest, Transpose0x2x3_2x3x0) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto operand = builder.ConstantR3FromArray3D(Array3D(0, 2, 3)); auto result = builder.Transpose(operand, {1, 2, 0}); @@ -82,7 +81,7 @@ XLA_TEST_F(TransposeTest, Transpose0x2x3_2x3x0) { } TEST_F(TransposeTest, Transpose1x2x3_2x3x1) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto operand = builder.ConstantR3FromArray3D({{{1, 2, 3}, {4, 5, 6}}}); auto result = builder.Transpose(operand, {1, 2, 0}); @@ -92,7 +91,7 @@ TEST_F(TransposeTest, Transpose1x2x3_2x3x1) { } TEST_F(TransposeTest, Transpose1x2x3_3x2x1) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto operand = builder.ConstantR3FromArray3D({{{1, 2, 3}, {4, 5, 6}}}); auto result = builder.Transpose(operand, {2, 1, 0}); @@ -102,7 +101,7 @@ TEST_F(TransposeTest, Transpose1x2x3_3x2x1) { } TEST_F(TransposeTest, Transpose1x2x3_1x2x3) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto operand = builder.ConstantR3FromArray3D({{{1, 2, 3}, {4, 5, 6}}}); auto result = builder.Transpose(operand, {0, 1, 2}); @@ -116,7 +115,7 @@ TEST_F(TransposeTest, MultiTranspose3x2) { Array2D transposed({{1.0f, 3.0f, 5.0f}, {2.0f, 4.0f, 6.0f}}); for (int transposes = 0; transposes <= 10; ++transposes) { - ComputationBuilder builder(client_, "Transpose"); + XlaBuilder builder("Transpose"); auto computed = builder.ConstantR2FromArray2D(input); for (int i = 0; i < transposes; ++i) { computed = builder.Transpose(computed, {1, 0}); @@ -130,7 +129,7 @@ TEST_F(TransposeTest, MultiTranspose3x2) { 
TEST_F(TransposeTest, Small_1x1) { auto aoperand = MakeLinspaceArray2D(0.0, 1.0, 1, 1); - ComputationBuilder builder(client_, "transpose_1x1"); + XlaBuilder builder("transpose_1x1"); auto operand = builder.ConstantR2FromArray2D(*aoperand); builder.Transpose(operand, {1, 0}); @@ -142,7 +141,7 @@ TEST_F(TransposeTest, Small_1x1) { TEST_F(TransposeTest, Small_2x2) { auto aoperand = MakeLinspaceArray2D(0.0, 4.0, 2, 2); - ComputationBuilder builder(client_, "transpose_2x2"); + XlaBuilder builder("transpose_2x2"); auto operand = builder.ConstantR2FromArray2D(*aoperand); builder.Transpose(operand, {1, 0}); @@ -162,7 +161,7 @@ void TransposeTest::TestTransposeConstant021(size_t n1, size_t n2, size_t n3) { } } - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto operand = builder.ConstantR3FromArray3D(aoperand); builder.Transpose(operand, {0, 2, 1}); diff --git a/tensorflow/compiler/xla/tests/tuple_test.cc b/tensorflow/compiler/xla/tests/tuple_test.cc index 61be1746530a19..41189231b90e84 100644 --- a/tensorflow/compiler/xla/tests/tuple_test.cc +++ b/tensorflow/compiler/xla/tests/tuple_test.cc @@ -17,8 +17,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" @@ -287,13 +285,13 @@ XLA_TEST_F(TupleTest, SelectBetweenTuplesOnFalse) { } XLA_TEST_F(TupleTest, TuplesInAMap) { - Computation tuple_computation; + XlaComputation tuple_computation; { // tuple_computation(x) = 100 * min(x, x^2) + max(x, x^2) using tuples. // // Need to put a select in there to prevent HLO-level optimizations from // optimizing out the tuples. - ComputationBuilder b(client_, "sort_square"); + XlaBuilder b("sort_square"); auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x"); auto x2 = b.Mul(x, x); auto x_smaller_tuple = b.Tuple({x, x2}); @@ -307,7 +305,7 @@ XLA_TEST_F(TupleTest, TuplesInAMap) { tuple_computation = computation_status.ConsumeValueOrDie(); } - ComputationBuilder b(client_, TestName()); + XlaBuilder b(TestName()); auto input = b.ConstantR1({-1.0f, 1.0f, 2.1f}); b.Map({input}, tuple_computation, {0}); ComputeAndCompareR1(&b, {-99.0f, 101.0f, 214.41f}, {}, error_spec_); @@ -497,7 +495,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) { auto sum = Literal::CreateR2({{{111, 222}, {331, 442}}, {{1011, 2022}, {3031, 4042}}, {{10011, 20022}, {30031, 40042}}}); - auto prod = Literal::CreateFromShape(sum->shape()); + auto prod = MakeUnique(sum->shape()); ASSERT_TRUE(prod->Populate( [&sum](tensorflow::gtl::ArraySlice indexes) { return sum->Get(indexes) * @@ -516,7 +514,7 @@ XLA_TEST_F(TupleTest, ComplexTuples) { class TupleHloTest : public HloTestBase {}; // Disabled on the interpreter because bitcast doesn't exist on the interpreter. -TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) { +XLA_TEST_F(TupleHloTest, DISABLED_ON_INTERPRETER(BitcastAfterGTE)) { const char* testcase = R"( HloModule m diff --git a/tensorflow/compiler/xla/tests/unary_op_test.cc b/tensorflow/compiler/xla/tests/unary_op_test.cc index 835e2d7e5594d7..c3abe22797f5ea 100644 --- a/tensorflow/compiler/xla/tests/unary_op_test.cc +++ b/tensorflow/compiler/xla/tests/unary_op_test.cc @@ -16,9 +16,9 @@ limitations under the License. 
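The tuple_test hunk above shows the subcomputation pattern under the new API: build the helper with its own XlaBuilder, take the XlaComputation from Build(), and pass it to Map on the outer builder. A condensed sketch, assuming the same test fixture; add_one is an illustrative name, not part of this patch:

    XlaComputation add_one;
    {
      XlaBuilder b("add_one");
      auto x = b.Parameter(0, ShapeUtil::MakeShape(F32, {}), "x");
      b.Add(x, b.ConstantR0<float>(1.0f));
      add_one = b.Build().ConsumeValueOrDie();
    }

    XlaBuilder b(TestName());
    auto input = b.ConstantR1<float>({-1.0f, 1.0f, 2.1f});
    b.Map({input}, add_one, {0});
    ComputeAndCompareR1<float>(&b, {0.0f, 2.0f, 3.1f}, {}, error_spec_);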
#include #include -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" #include "tensorflow/compiler/xla/tests/literal_test_util.h" #include "tensorflow/compiler/xla/tests/test_macros.h" @@ -37,7 +37,7 @@ class UnaryOpTest : public ClientLibraryTestBase { } template void AbsSize0TestHelper() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1({}); auto abs = builder.Abs(arg); @@ -50,7 +50,7 @@ class UnaryOpTest : public ClientLibraryTestBase { template void AbsTestHelper() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1({-2, 25, 0, -123, inf(), -inf()}); auto abs = builder.Abs(arg); @@ -59,7 +59,7 @@ class UnaryOpTest : public ClientLibraryTestBase { template void SignTestHelper() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1( {-2, 25, 0, static_cast(-0.0), -123, inf(), -inf()}); auto sign = builder.Sign(arg); @@ -69,7 +69,7 @@ class UnaryOpTest : public ClientLibraryTestBase { template void SignAbsTestHelper() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1({-2, 25, 0, -123}); auto sign = builder.Sign(arg); auto abs = builder.Abs(arg); @@ -84,9 +84,14 @@ int UnaryOpTest::inf() { return 2147483647; } +template <> +int64 UnaryOpTest::inf() { + return 0x7FFFFFFFFFFFFFFFl; +} + template <> void UnaryOpTest::AbsTestHelper() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1({{-2, 0}, {0, 25}, {0, 0}, @@ -102,7 +107,7 @@ void UnaryOpTest::AbsTestHelper() { template <> void UnaryOpTest::SignTestHelper() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1( {{-2, 0}, {0, 25}, {0, 0}, {static_cast(-0.0), 0}, {-1, 1}}); auto sign = builder.Sign(arg); @@ -114,7 +119,7 @@ void UnaryOpTest::SignTestHelper() { template <> void UnaryOpTest::SignAbsTestHelper() { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1({{-2, 0}, {0, 25}, {0, 0}, {-0.4, 0.3}}); auto sign = builder.Sign(arg); @@ -139,7 +144,7 @@ XLA_TEST_F(UnaryOpTest, AbsTestR1) { } XLA_TEST_F(UnaryOpTest, AbsTestR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto argi = builder.ConstantR0(-5); auto absi = builder.Abs(argi); auto argf = builder.ConstantR0(-3.0f); @@ -155,7 +160,7 @@ XLA_TEST_F(UnaryOpTest, AbsTestR0) { } XLA_TEST_F(UnaryOpTest, SignTestR0) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto argi = builder.ConstantR0(-5); auto sgni = builder.Sign(argi); // -1 auto argf = builder.ConstantR0(-4.0f); @@ -176,6 +181,7 @@ XLA_TEST_F(UnaryOpTest, SignTestR0) { XLA_TEST_F(UnaryOpTest, SignTestR1) { SignTestHelper(); + SignTestHelper(); SignTestHelper(); SignTestHelper(); } @@ -187,7 +193,7 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR1) { } XLA_TEST_F(UnaryOpTest, UnsignedAbsTestR1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1( {2, 25, 0, 123, std::numeric_limits::max()}); 
auto abs = builder.Abs(arg); @@ -197,7 +203,7 @@ XLA_TEST_F(UnaryOpTest, UnsignedAbsTestR1) { } XLA_TEST_F(UnaryOpTest, UnsignedSignTestR1) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR1( {2, 25, 0, 123, std::numeric_limits::max()}); auto sign = builder.Sign(arg); @@ -206,7 +212,7 @@ XLA_TEST_F(UnaryOpTest, UnsignedSignTestR1) { } XLA_TEST_F(UnaryOpTest, SignAbsTestR2) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto arg = builder.ConstantR2({{1.0, -2.0}, {-3.0, 4.0}}); auto sign = builder.Sign(arg); auto abs = builder.Abs(arg); @@ -216,7 +222,7 @@ XLA_TEST_F(UnaryOpTest, SignAbsTestR2) { } XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR1({0, 1}); auto rhs = builder.ConstantR1({1, 1}); builder.ConvertElementType(builder.Eq(lhs, rhs), S32); @@ -225,7 +231,7 @@ XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToS32) { } XLA_TEST_F(UnaryOpTest, ConvertElementTypePredToF32) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto lhs = builder.ConstantR1({0, 1}); auto rhs = builder.ConstantR1({1, 1}); builder.ConvertElementType(builder.Eq(lhs, rhs), F32); diff --git a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc index 3dded3f7157195..5cce7a2bf82c1a 100644 --- a/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc +++ b/tensorflow/compiler/xla/tests/vector_ops_simple_test.cc @@ -18,7 +18,6 @@ limitations under the License. #include #include "tensorflow/compiler/xla/array4d.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/local_client.h" @@ -350,7 +349,7 @@ XLA_TEST_F(VecOpsSimpleTest, ClampTenValuesConstantNonzeroLower) { } XLA_TEST_F(VecOpsSimpleTest, ClampValuesConstantS64) { - ComputationBuilder builder(client_, TestName()); + XlaBuilder builder(TestName()); auto zero = builder.ConstantR0(0); auto one = builder.ConstantR0(10); auto x = builder.ConstantR1({-3, 3, 9, 13}); diff --git a/tensorflow/compiler/xla/tests/while_test.cc b/tensorflow/compiler/xla/tests/while_test.cc index 336fed27c6f19f..c463f3eac55e5b 100644 --- a/tensorflow/compiler/xla/tests/while_test.cc +++ b/tensorflow/compiler/xla/tests/while_test.cc @@ -957,22 +957,21 @@ TEST_F(WhileTest, DISABLED_ON_INTERPRETER(WhileWithPrngScalarResult)) { TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) { auto element_shape = ShapeUtil::MakeShape(F32, {2}); - ComputationBuilder outer(client_, "outer"); + XlaBuilder outer("outer"); auto p = outer.Parameter(0, element_shape, "param"); auto t = outer.Tuple({p, outer.ConstantR1({1, 1})}); - TF_ASSERT_OK_AND_ASSIGN(const std::unique_ptr tuple_shape, - outer.GetShape(t)); + TF_ASSERT_OK_AND_ASSIGN(Shape tuple_shape, outer.GetShape(t)); - ComputationBuilder cond(client_, "cond"); - auto cond_t = cond.Parameter(0, *tuple_shape, "t"); + XlaBuilder cond("cond"); + auto cond_t = cond.Parameter(0, tuple_shape, "t"); TF_ASSERT_OK(Any(cond.Eq(cond.GetTupleElement(cond_t, 0), cond.ConstantR1({42, 42})), &cond) .status()); - ComputationBuilder body(client_, "body"); - auto body_t = body.Parameter(0, *tuple_shape, "t"); + XlaBuilder body("body"); + auto body_t = body.Parameter(0, tuple_shape, "t"); auto 
e = body.GetTupleElement(body_t, 1); body.Tuple({e, e}); @@ -993,15 +992,15 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithTupleElement) { TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) { auto element_shape = ShapeUtil::MakeShape(F32, {2}); - ComputationBuilder outer(client_, "outer"); + XlaBuilder outer("outer"); auto p = outer.Parameter(0, element_shape, "param"); - ComputationBuilder cond(client_, "cond"); + XlaBuilder cond("cond"); auto cond_t = cond.Parameter(0, element_shape, "t"); TF_ASSERT_OK( Any(cond.Eq(cond_t, cond.ConstantR1({42, 42})), &cond).status()); - ComputationBuilder body(client_, "body"); + XlaBuilder body("body"); auto body_t = body.Parameter(0, element_shape, "t"); auto e = body.Broadcast(body.ConstantR0(1.0), {2}); @@ -1019,14 +1018,14 @@ TEST_F(WhileTest, WhileThatSwapsParameterWithBroadcast) { TEST_F(WhileTest, WhileThatTurnsScalarParameterToTupleElement) { auto element_shape = ShapeUtil::MakeShape(F32, {}); - ComputationBuilder outer(client_, "outer"); + XlaBuilder outer("outer"); auto p = outer.Parameter(0, element_shape, "param"); - ComputationBuilder cond(client_, "cond"); + XlaBuilder cond("cond"); auto cond_t = cond.Parameter(0, element_shape, "t"); cond.Eq(cond_t, cond.ConstantR0(42)); - ComputationBuilder body(client_, "body"); + XlaBuilder body("body"); auto body_t = body.Parameter(0, element_shape, "t"); auto tuple = body.Tuple({body_t, body.Add(body_t, body.ConstantR0(1))}); @@ -1055,23 +1054,23 @@ TEST_F(WhileTest, WhileWithMixedTupleElements) { auto result_shape = ShapeUtil::MakeTupleShape( {ShapeUtil::MakeShape(S32, {}), ShapeUtil::MakeShape(S32, {})}); - ComputationBuilder outer(client_, "outer"); + XlaBuilder outer("outer"); auto p = outer.Tuple({outer.ConstantR0(0), outer.Parameter(0, ShapeUtil::MakeShape(S32, {}), "t")}); - ComputationBuilder cond(client_, "cond"); + XlaBuilder cond("cond"); auto params = cond.Parameter(0, result_shape, "prev"); auto cond_t = cond.Add(cond.GetTupleElement(params, 1), cond.GetTupleElement(params, 0)); cond.Lt(cond_t, cond.ConstantR0(30)); - ComputationBuilder body(client_, "body"); + XlaBuilder body("body"); auto body_t = body.Parameter(0, result_shape, "t"); auto tuple = body.Tuple( - {body.Add(body.GetTupleElement(params, 0), body.ConstantR0(1)), - body.Add(body.GetTupleElement(params, 1), body.ConstantR0(1))}); + {body.Add(body.GetTupleElement(body_t, 0), body.ConstantR0(1)), + body.Add(body.GetTupleElement(body_t, 1), body.ConstantR0(1))}); TF_ASSERT_OK_AND_ASSIGN(auto cond_computation, cond.Build()); TF_ASSERT_OK_AND_ASSIGN(auto body_computation, body.Build()); diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc index 8354bb71cb7e88..3c9a01653c6720 100644 --- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc +++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc @@ -17,8 +17,9 @@ limitations under the License. 
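The while_test hunks above also capture the GetShape change: XlaBuilder::GetShape returns the Shape by value inside a StatusOr, where ComputationBuilder returned a unique_ptr, so the parameter declarations drop the dereference. A condensed sketch mirroring the patched test:

    XlaBuilder outer("outer");
    auto p = outer.Parameter(0, ShapeUtil::MakeShape(F32, {2}), "param");
    auto t = outer.Tuple({p, outer.ConstantR1<float>({1, 1})});
    TF_ASSERT_OK_AND_ASSIGN(Shape tuple_shape, outer.GetShape(t));

    XlaBuilder cond("cond");
    auto cond_t = cond.Parameter(0, tuple_shape, "t");  // no *tuple_shape dereference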
#include #include "tensorflow/compiler/xla/array2d.h" -#include "tensorflow/compiler/xla/client/computation_builder.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_builder.h" +#include "tensorflow/compiler/xla/client/xla_client/xla_computation.h" #include "tensorflow/compiler/xla/map_util.h" #include "tensorflow/compiler/xla/service/platform_util.h" #include "tensorflow/compiler/xla/shape_util.h" @@ -83,8 +84,8 @@ Status ParseOneProfileOutputLine( string match_percentage = "\\d+\\.\\d\\d%"; string match_cycles = "(\\d+) cycles +\\( *(" + match_percentage + ")\\)"; string match_usecs = "([0-9.]+) usec"; - string match_flops = "([^ ]+)"; - string match_trops = "([^ ]+)"; + string match_flops = "([^ ]*)"; + string match_trops = "([^ ]*)"; string match_bytes_per_sec = "([0-9.TGMKi]+)B/s"; string match_bytes_per_cycle = "([0-9.TGMKi]+)B/cycle"; @@ -119,7 +120,7 @@ Status ParseOneProfileOutputLine( // Returns void so that we can ASSERT. void ExecuteAndFetchProfile(string* profile_output, LocalClient* client, - const Computation& computation, + const XlaComputation& computation, const Shape& lhs_arg_shape, const Shape& rhs_arg_shape) { LocalService* service = ClientLibrary::GetXlaService(client->platform()); @@ -185,7 +186,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) { TF_ASSERT_OK_AND_ASSIGN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(platform)); - ComputationBuilder builder(client, TestName()); + XlaBuilder builder(TestName()); auto result = builder.Tanh(builder.Add( builder.Parameter(0, ShapeUtil::MakeShape(F32, {m, k}), "dot_lhs"), builder.Parameter(1, ShapeUtil::MakeShape(F32, {k, n}), "dot_rhs"))); @@ -251,18 +252,18 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) { TF_ASSERT_OK_AND_ASSIGN(LocalClient * client, ClientLibrary::GetOrCreateLocalClient(platform)); - Computation condition; + XlaComputation condition; { - ComputationBuilder builder(client, "condition"); + XlaBuilder builder("condition"); auto state = builder.Parameter(0, while_result_shape, "state"); auto iteration = builder.GetTupleElement(state, 0); builder.Gt(builder.ConstantR0(5), iteration); TF_ASSERT_OK_AND_ASSIGN(condition, builder.Build()); } - Computation body; + XlaComputation body; { - ComputationBuilder builder(client, "body"); + XlaBuilder builder("body"); auto state = builder.Parameter(0, while_result_shape, "state"); auto matrix = builder.GetTupleElement(state, 1); auto next_iteration = builder.Add(builder.GetTupleElement(state, 0), @@ -271,7 +272,7 @@ XLA_TEST_F(HloProfileTest, DISABLED_ON_GPU(ProfileWhileComputation)) { TF_ASSERT_OK_AND_ASSIGN(body, builder.Build()); } - ComputationBuilder builder(client, TestName()); + XlaBuilder builder(TestName()); auto initial_while_state = builder.Tuple({builder.ConstantR0(0), builder.Parameter(0, matrix_shape, "initial_value")}); diff --git a/tensorflow/compiler/xla/text_literal_reader.cc b/tensorflow/compiler/xla/text_literal_reader.cc index 44f874cd2ae8e6..56702feab9a4e8 100644 --- a/tensorflow/compiler/xla/text_literal_reader.cc +++ b/tensorflow/compiler/xla/text_literal_reader.cc @@ -42,7 +42,7 @@ StatusOr> TextLiteralReader::ReadPath( << "TextLiteralReader no longer supports reading .gz files"; std::unique_ptr file; Status s = - tensorflow::Env::Default()->NewRandomAccessFile(path.ToString(), &file); + tensorflow::Env::Default()->NewRandomAccessFile(std::string(path), &file); if (!s.ok()) { return s; } @@ -92,7 +92,7 @@ StatusOr> 
TextLiteralReader::ReadAllLines() { tensorflow::StringPiece sp(shape_string); if (tensorflow::str_util::RemoveWhitespaceContext(&sp) > 0) { - string tmp = sp.ToString(); + string tmp = std::string(sp); shape_string = tmp; } TF_ASSIGN_OR_RETURN(Shape shape, ShapeUtil::ParseShapeString(shape_string)); @@ -124,10 +124,10 @@ StatusOr> TextLiteralReader::ReadAllLines() { line.c_str()); } float value; - if (!tensorflow::strings::safe_strtof(value_string.ToString().c_str(), + if (!tensorflow::strings::safe_strtof(std::string(value_string).c_str(), &value)) { return InvalidArgument("could not parse value as float: \"%s\"", - value_string.ToString().c_str()); + std::string(value_string).c_str()); } SplitByDelimToStringPieces(coordinates_string, ',', &coordinates); coordinate_values.clear(); @@ -136,7 +136,7 @@ StatusOr> TextLiteralReader::ReadAllLines() { if (!tensorflow::strings::safe_strto64(piece, &coordinate_value)) { return InvalidArgument( "could not parse coordinate member as int64: \"%s\"", - piece.ToString().c_str()); + std::string(piece).c_str()); } coordinate_values.push_back(coordinate_value); } diff --git a/tensorflow/compiler/xla/text_literal_writer.cc b/tensorflow/compiler/xla/text_literal_writer.cc index 3fee467594d842..373c0d2d8d8ab0 100644 --- a/tensorflow/compiler/xla/text_literal_writer.cc +++ b/tensorflow/compiler/xla/text_literal_writer.cc @@ -30,10 +30,10 @@ limitations under the License. namespace xla { -/* static */ tensorflow::Status TextLiteralWriter::WriteToPath( +/* static */ Status TextLiteralWriter::WriteToPath( const Literal& literal, tensorflow::StringPiece path) { std::unique_ptr f; - auto s = tensorflow::Env::Default()->NewWritableFile(path.ToString(), &f); + auto s = tensorflow::Env::Default()->NewWritableFile(std::string(path), &f); if (!s.ok()) { return s; } @@ -43,7 +43,7 @@ namespace xla { return s; } - tensorflow::Status status; + Status status; tensorflow::WritableFile* f_ptr = f.get(); literal.EachCellAsString( [f_ptr, &status](tensorflow::gtl::ArraySlice indices, diff --git a/tensorflow/compiler/xla/text_literal_writer.h b/tensorflow/compiler/xla/text_literal_writer.h index 7375493f4309c9..0a1235b5e04675 100644 --- a/tensorflow/compiler/xla/text_literal_writer.h +++ b/tensorflow/compiler/xla/text_literal_writer.h @@ -37,8 +37,8 @@ namespace xla { // This should be readable by xla::TextLiteralReader. 
class TextLiteralWriter { public: - static tensorflow::Status WriteToPath(const Literal& literal, - tensorflow::StringPiece path); + static Status WriteToPath(const Literal& literal, + tensorflow::StringPiece path); private: TF_DISALLOW_COPY_AND_ASSIGN(TextLiteralWriter); diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD index 0bc4045a549031..ff5340ee3fac51 100644 --- a/tensorflow/compiler/xla/tools/BUILD +++ b/tensorflow/compiler/xla/tools/BUILD @@ -36,11 +36,10 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service", - "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", ], ) @@ -63,10 +62,9 @@ tf_cc_binary( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", - "//tensorflow/compiler/xla/service:session_proto", "//tensorflow/core:lib", ], ) @@ -84,11 +82,11 @@ cc_library( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:global_data", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/client/lib:testing", - "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/compiler/xla/service:hlo_proto", + "//tensorflow/compiler/xla/service/gpu:infeed_manager", "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", @@ -137,7 +135,7 @@ tf_cc_binary( deps = [ "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla/service:session_proto", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/core:lib", ], ) @@ -164,12 +162,10 @@ tf_cc_binary( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service", - "//tensorflow/compiler/xla/service:computation_tracker", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", - "//tensorflow/compiler/xla/service:session_proto", "//tensorflow/core:lib", ], ) @@ -183,12 +179,11 @@ tf_cc_binary( "//tensorflow/compiler/xla:xla_data_proto", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - "//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:hlo", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", - "//tensorflow/compiler/xla/service:session_proto", "//tensorflow/core:lib", ], ) @@ -201,13 +196,12 @@ tf_cc_binary( "//tensorflow/compiler/xla:types", "//tensorflow/compiler/xla/client", "//tensorflow/compiler/xla/client:client_library", - 
"//tensorflow/compiler/xla/client:computation", "//tensorflow/compiler/xla/client:local_client", "//tensorflow/compiler/xla/legacy_flags:debug_options_flags", "//tensorflow/compiler/xla/service", "//tensorflow/compiler/xla/service:hlo_graph_dumper", + "//tensorflow/compiler/xla/service:hlo_proto", "//tensorflow/compiler/xla/service:interpreter_plugin", - "//tensorflow/compiler/xla/service:session_proto", "//tensorflow/core:lib", ], ) diff --git a/tensorflow/compiler/xla/tools/convert_computation.cc b/tensorflow/compiler/xla/tools/convert_computation.cc index fe03a6e7bdfe99..14d01b5bfb067c 100644 --- a/tensorflow/compiler/xla/tools/convert_computation.cc +++ b/tensorflow/compiler/xla/tools/convert_computation.cc @@ -21,7 +21,7 @@ limitations under the License. #include #include -#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/platform/env.h" @@ -33,7 +33,7 @@ namespace xla { namespace tools { void RealMain(const string& mode, const string& path) { - SessionModule module; + HloSnapshot module; tensorflow::Env* env = tensorflow::Env::Default(); if (mode == "txt2bin") { TF_CHECK_OK(tensorflow::ReadTextProto(env, path, &module)); diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc index 21ae8583d7cd33..befb55453777dc 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_graphviz.cc @@ -17,7 +17,7 @@ limitations under the License. // // Dumps a graphviz URL for a snapshot computation to the command line. // -// some_binary_snapshot_proto is obtained by serializing the SessionModule from +// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from // ServiceInterface::SnapshotComputation to disk. // // The GraphViz URL is placed into the log stderr, whereas computation @@ -30,11 +30,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -49,10 +48,11 @@ namespace tools { void RealMain(tensorflow::gtl::ArraySlice args) { Client* client = ClientLibrary::LocalClientOrDie(); for (char* arg : args) { - SessionModule module; + HloSnapshot module; TF_CHECK_OK( tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module)); - Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie(); + XlaComputation computation = + client->LoadSnapshot(module).ConsumeValueOrDie(); DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags(); debug_options.set_xla_generate_hlo_graph(".*"); ComputationStats stats = diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc index b82f1c81c84b48..cfb8f37487d649 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_operation_list.cc @@ -21,11 +21,10 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -66,16 +65,16 @@ void RealMain(tensorflow::gtl::ArraySlice args) { LocalService* local_service = ClientLibrary::GetXlaService(client->platform()); for (char* arg : args) { - SessionModule session_module; + HloSnapshot snapshot; TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, - &session_module)); - auto computation_status = client->LoadSnapshot(session_module); + &snapshot)); + auto computation_status = client->LoadSnapshot(snapshot); if (!computation_status.ok()) { fprintf(stderr, "could not load snapshot for %s: %s\n", arg, computation_status.status().ToString().c_str()); continue; } - Computation computation = computation_status.ConsumeValueOrDie(); + XlaComputation computation = computation_status.ConsumeValueOrDie(); std::unique_ptr program_shape = client->GetComputationShape(computation).ConsumeValueOrDie(); @@ -89,8 +88,7 @@ void RealMain(tensorflow::gtl::ArraySlice args) { build_options.set_device_ordinal(0); build_options.set_result_layout(program_shape->result()); StatusOr> executable = - local_service->CompileExecutable(computation.handle(), layouts, - build_options); + local_service->CompileExecutable(computation, layouts, build_options); const HloModule& module = executable.ValueOrDie()->module(); diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc 
b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc index 05c0fdf97d27c0..5dd5150be33984 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc @@ -19,11 +19,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/service/computation_tracker.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" @@ -40,16 +38,16 @@ void RealMain(tensorflow::gtl::ArraySlice args, bool compile) { LocalService* local_service = ClientLibrary::GetXlaService(client->platform()); for (char* arg : args) { - SessionModule session_module; + HloSnapshot snapshot; TF_CHECK_OK(tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, - &session_module)); - auto computation_status = client->LoadSnapshot(session_module); + &snapshot)); + auto computation_status = client->LoadSnapshot(snapshot); if (!computation_status.ok()) { fprintf(stderr, "could not load snapshot for %s: %s\n", arg, computation_status.status().ToString().c_str()); continue; } - Computation computation = computation_status.ConsumeValueOrDie(); + XlaComputation computation = computation_status.ConsumeValueOrDie(); if (compile) { std::unique_ptr program_shape = @@ -65,8 +63,7 @@ void RealMain(tensorflow::gtl::ArraySlice args, bool compile) { build_options.set_device_ordinal(0); build_options.set_result_layout(program_shape->result()); StatusOr> executable = - local_service->CompileExecutable(computation.handle(), layouts, - build_options); + local_service->CompileExecutable(computation, layouts, build_options); const HloModule& module = executable.ValueOrDie()->module(); @@ -74,13 +71,11 @@ void RealMain(tensorflow::gtl::ArraySlice args, bool compile) { local_service->backend().platform()->Name().c_str(), module.ToString(HloPrintOptions::ShortParsable()).c_str()); } else { - const ComputationTracker& tracker = local_service->computation_tracker(); - UserComputation* user_computation = - tracker.Resolve(computation.handle()).ConsumeValueOrDie(); - VersionedComputationHandle versioned_handle = - user_computation->GetVersionedHandle(); + auto config = HloModule::CreateModuleConfigFromProto(computation.proto(), + DebugOptions()) + .ConsumeValueOrDie(); std::unique_ptr module = - tracker.BuildHloModule(versioned_handle, HloModuleConfig()) + HloModule::CreateFromProto(computation.proto(), config) .ConsumeValueOrDie(); fprintf(stdout, "%s\n", diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc index 51f90b07c66f7d..a5dce20456c6a2 100644 --- a/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc +++ b/tensorflow/compiler/xla/tools/dumped_computation_to_tf_graphdef.cc @@ -28,11 +28,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/service/service.h" -#include "tensorflow/compiler/xla/service/session.pb.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/core/lib/gtl/array_slice.h" @@ -48,10 +47,11 @@ namespace tools { void RealMain(tensorflow::gtl::ArraySlice args) { Client* client = ClientLibrary::LocalClientOrDie(); for (char* arg : args) { - SessionModule module; + HloSnapshot module; TF_CHECK_OK( tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module)); - Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie(); + XlaComputation computation = + client->LoadSnapshot(module).ConsumeValueOrDie(); DebugOptions debug_options = legacy_flags::GetDebugOptionsFromFlags(); debug_options.set_xla_generate_hlo_graph(".*"); debug_options.set_xla_hlo_dump_as_graphdef(true); diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD deleted file mode 100644 index 0fa4b98d0a41a1..00000000000000 --- a/tensorflow/compiler/xla/tools/parser/BUILD +++ /dev/null @@ -1,72 +0,0 @@ -# Build file for the Hlo parser. - -licenses(["notice"]) # Apache 2.0 - -package( - default_visibility = [":friends"], -) - -package_group( - name = "friends", - includes = [ - "//tensorflow/compiler/xla:friends", - ], -) - -# Filegroup used to collect source files for dependency checking. -filegroup( - name = "c_srcs", - data = glob([ - "**/*.cc", - "**/*.h", - ]), -) - -load("//tensorflow:tensorflow.bzl", "tf_cc_test") - -cc_library( - name = "hlo_lexer", - srcs = ["hlo_lexer.cc"], - hdrs = [ - "hlo_lexer.h", - "hlo_token.h", - ], - deps = [ - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:types", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/core:lib", - "//tensorflow/core:regexp_internal", - ], -) - -cc_library( - name = "hlo_parser", - srcs = ["hlo_parser.cc"], - hdrs = ["hlo_parser.h"], - deps = [ - ":hlo_lexer", - "//tensorflow/compiler/xla:literal_util", - "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:statusor", - "//tensorflow/compiler/xla:util", - "//tensorflow/compiler/xla:xla_data_proto", - "//tensorflow/compiler/xla/service:hlo", - "//tensorflow/core:lib", - "//tensorflow/core:lib_internal", - ], -) - -tf_cc_test( - name = "hlo_parser_test", - size = "small", - srcs = ["hlo_parser_test.cc"], - deps = [ - ":hlo_parser", - "//tensorflow/core:lib", - "//tensorflow/core:test", - "//tensorflow/core:test_main", - ], -) diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc index 62a353ad09af00..be094b7890aab0 100644 --- a/tensorflow/compiler/xla/tools/replay_computation.cc +++ b/tensorflow/compiler/xla/tools/replay_computation.cc @@ -17,7 +17,7 @@ limitations under the License. // // Replays computations and shows the results on the command line. 
// -// some_binary_snapshot_proto is obtained by serializing the SessionModule from +// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from // ServiceInterface::SnapshotComputation to disk. // // Computations that require arguments can be replayed using fake data by @@ -36,13 +36,13 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/global_data.h" #include "tensorflow/compiler/xla/client/lib/testing.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/execution_options_util.h" #include "tensorflow/compiler/xla/literal_util.h" -#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/service/gpu/infeed_manager.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/statusor.h" @@ -64,99 +64,157 @@ namespace { // fields. struct Options { string fake_infeed_shape; + bool generate_fake_infeed = false; bool use_fake_data = false; bool print_result = true; int num_runs = 1; - bool xla_hlo_profile_last_run = false; }; // Invokes the given computation passing arbitrary data for every (unbound) // parameter if use_fake_data, Otherwise use recorded data if available. // -// Similarly, infeeds fake data of shape fake_infeed_shape if it is provided; -// otherwise, no infeed is performed. -StatusOr> ReplayComputation( - const SessionModule& module, Client* client, const Options& opts) { - TF_ASSIGN_OR_RETURN(Computation computation, client->LoadSnapshot(module)); +// Similarly, infeeds fake data of shape fake_infeed_shape if it is provided. +// If generate_fake_infeed is true, the required infeed shape is derived from +// the computation and then used to provide a fake infeed shape. +// +// If neither generate_fake_infeed is true nor a fake_infeed_shape is provided, +// no infeed is performed. +StatusOr ReplayComputation(const HloSnapshot& module, + LocalClient* client, const Options& opts) { + XlaComputation computation(module.hlo().hlo_module()); - std::vector> arguments; + // Build the `argument_ptrs` vector, which contains ShapedBuffer*s to our + // arguments. This is a bit involved, because we may have to convert from + // GlobalData to ShapedBuffer*, and we have to manage the lifetime of all our + // objects. 
+ std::vector scoped_shaped_buffer_arguments; + std::vector> global_data_arguments; + std::vector argument_ptrs; if (opts.use_fake_data) { - arguments = MakeFakeArgumentsOrDie(computation, client); + global_data_arguments = MakeFakeArgumentsOrDie(computation, client); + for (const auto& data : global_data_arguments) { + argument_ptrs.push_back( + client->GlobalDataToShapedBuffer(data->handle(), /*device_ordinal=*/0) + .ValueOrDie()); + } } else { // use recorded data if available for (const auto& proto : module.arguments()) { TF_ASSIGN_OR_RETURN(std::unique_ptr literal, Literal::CreateFromProto(proto)); - TF_ASSIGN_OR_RETURN(std::unique_ptr data, - client->TransferToServer(*literal)); - arguments.push_back(std::move(data)); + TF_ASSIGN_OR_RETURN( + ScopedShapedBuffer data, + client->LiteralToShapedBuffer(*literal, /*device_ordinal=*/0)); + scoped_shaped_buffer_arguments.push_back(std::move(data)); + } + for (const auto& argument : scoped_shaped_buffer_arguments) { + argument_ptrs.push_back(&argument); } } + bool provide_infeed = false; + Shape infeed_shape; + if (!opts.fake_infeed_shape.empty()) { + StatusOr shape_status = + ShapeUtil::ParseShapeString(opts.fake_infeed_shape); + TF_CHECK_OK(shape_status.status()); + infeed_shape = std::move(shape_status).ValueOrDie(); + provide_infeed = true; + } else if (opts.generate_fake_infeed) { + for (const auto& comp : computation.proto().computations()) { + for (const auto& instruction : comp.instructions()) { + if (instruction.opcode() == HloOpcodeString(HloOpcode::kInfeed)) { + CHECK(!provide_infeed) + << "--generate_fake_infeed only works if the model has 0 or 1 " + "infeed ops, but this one has >= 2."; + provide_infeed = true; + infeed_shape = instruction.shape(); + LOG(INFO) << "Generating fake infeed shape for inferred shape: " + << ShapeUtil::HumanString(infeed_shape); + } + } + } + } // We only instantiate the thread pool if the user has requested that a - // concurrent infeed occur via the fake_infeed_shape. + // concurrent infeed occur via the fake_infeed_shape, or when + // --generate_fake_infeed is passed and there exists an infeed operation in + // the HloSnapshot. tensorflow::gtl::optional pool; - - if (!opts.fake_infeed_shape.empty()) { + std::unique_ptr data; + if (provide_infeed) { + data = std::move(MakeFakeLiteral(infeed_shape)).ValueOrDie(); + } + auto transfer_infeed = [&data, client]() { + TF_CHECK_OK(client->TransferToInfeed(*data)); + }; + if (provide_infeed) { pool.emplace(tensorflow::Env::Default(), "infeed", /*num_threads=*/1); - pool->Schedule([opts, client]() { - StatusOr shape_status = - ShapeUtil::ParseShapeString(opts.fake_infeed_shape); - TF_CHECK_OK(shape_status.status()); - Shape shape = std::move(shape_status).ValueOrDie(); - StatusOr> data_status = MakeFakeLiteral(shape); - TF_CHECK_OK(data_status.status()); - std::unique_ptr data = std::move(data_status).ValueOrDie(); - while (true) { - TF_CHECK_OK(client->TransferToInfeed(*data)); - } + pool->Schedule([transfer_infeed]() { + // There may be several infeed buffers needed, however we don't know how + // many. If we proactively transfer too many infeed buffers, we may run + // out of memory. If we transfer too few infeed buffers, the program will + // hang. Therefore, we register a callback that is called when the infeed + // becomes empty, and in this callback we will transfer another fake + // infeed. 
+ auto infeed_manager = xla::gpu::GetOrCreateInfeedManager(); + infeed_manager->RegisterOnEmptyCallback(transfer_infeed); + transfer_infeed(); }); } - std::vector execute_arguments; - execute_arguments.reserve(arguments.size()); - for (auto& argument : arguments) { - execute_arguments.push_back(argument.get()); + std::vector argument_layouts; + for (const auto& param : computation.proto().program_shape().parameters()) { + argument_layouts.push_back(¶m); } + std::unique_ptr executable = + client->Compile(computation, argument_layouts, ExecutableBuildOptions()) + .ValueOrDie(); // Run the computation num_runs times, and return the result from the last // execution. - std::unique_ptr result; + StreamExecutorMemoryAllocator allocator( + client->platform(), + {client->platform()->ExecutorForDevice(0).ValueOrDie()}); + tensorflow::gtl::optional result; for (int i = 0; i < opts.num_runs; ++i) { ExecutionProfile profile; - ExecutionOptions execution_options = CreateDefaultExecutionOptions(); - if (opts.xla_hlo_profile_last_run && i == opts.num_runs - 1) { - execution_options.mutable_debug_options()->set_xla_hlo_profile(true); - } + ExecutableRunOptions run_options; + run_options.set_execution_profile(&profile); + run_options.set_allocator(&allocator); - if (opts.print_result) { - TF_ASSIGN_OR_RETURN( - result, client->ExecuteAndTransfer(computation, execute_arguments, - &execution_options, &profile)); - } else { - // If we're not printing the result, execute the computation but don't - // bother retrieving the result. This can be a significant speedup. - TF_RETURN_IF_ERROR(client - ->Execute(computation, execute_arguments, - &execution_options, &profile) - .status()); - } + TF_ASSIGN_OR_RETURN(result, executable->Run(argument_ptrs, run_options)); LOG(INFO) << "Execution took " << static_cast(profile.compute_time_ns()) / 1e9 << "s"; } - return std::move(result); + // Check that --num_runs > 0, otherwise *result below will fail with an + // unhelpful error (because the loop didn't run any iterations). + CHECK_GT(opts.num_runs, 0) << "--num_runs must be > 0"; + TF_ASSIGN_OR_RETURN(std::unique_ptr result_literal, + client->ShapedBufferToLiteral(*result)); + return std::move(*result_literal); } int RealMain(tensorflow::gtl::ArraySlice args, const Options& opts) { - Client* client = ClientLibrary::LocalClientOrDie(); + LocalClient* client = ClientLibrary::LocalClientOrDie(); tensorflow::Env* env = tensorflow::Env::Default(); int exit_status = EXIT_SUCCESS; for (char* arg : args) { - SessionModule module; - TF_CHECK_OK(tensorflow::ReadBinaryProto(env, arg, &module)); - StatusOr> result_status = - ReplayComputation(module, client, opts); + HloSnapshot snapshot; + auto status = tensorflow::ReadBinaryProto(env, arg, &snapshot); + if (!status.ok()) { + fprintf(stderr, "%s: is not HloSnapshot. 
Trying HloProto.\n", arg); + status = tensorflow::ReadBinaryProto(env, arg, snapshot.mutable_hlo()); + if (!status.ok()) { + fprintf(stderr, "%s: is not HloSnapshot or HloProto: %s.\n", arg, + status.ToString().c_str()); + continue; + } + CHECK(opts.use_fake_data) + << "HloProto input must be handled with --use_fake_data"; + } + + StatusOr result_status = ReplayComputation(snapshot, client, opts); if (!result_status.ok()) { fprintf(stderr, "%s: error: %s\n", arg, result_status.status().ToString().c_str()); @@ -164,16 +222,17 @@ int RealMain(tensorflow::gtl::ArraySlice args, const Options& opts) { continue; } - std::unique_ptr result = result_status.ConsumeValueOrDie(); - if (result != nullptr) { - fprintf(stdout, "%s: %s :: %s:%s\n", arg, module.entry().name().c_str(), - ShapeUtil::HumanString(result->shape()).c_str(), - result->ToString().c_str()); - if (module.has_result()) { + if (opts.print_result) { + Literal result = std::move(result_status).ValueOrDie(); + fprintf(stdout, "%s: %s :: %s:%s\n", arg, + snapshot.hlo().hlo_module().name().c_str(), + ShapeUtil::HumanString(result.shape()).c_str(), + result.ToString().c_str()); + if (snapshot.has_result()) { std::unique_ptr literal = - Literal::CreateFromProto(module.result()).ConsumeValueOrDie(); + Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie(); fprintf(stdout, "was %s:%s\n", - ShapeUtil::HumanString(module.result().shape()).c_str(), + ShapeUtil::HumanString(snapshot.result().shape()).c_str(), literal->ToString().c_str()); } } @@ -198,9 +257,9 @@ int main(int argc, char** argv) { "Number of times to run each computation"), tensorflow::Flag("fake_infeed_shape", &opts.fake_infeed_shape, "Shape of fake data to construct for (infinite) infeed"), - tensorflow::Flag( - "xla_hlo_profile_last_run", &opts.xla_hlo_profile_last_run, - "Pass --xla_hlo_profile the last time we run the computation."), + tensorflow::Flag("generate_fake_infeed", &opts.generate_fake_infeed, + "Whether a fake infeed shape should be generated " + "derived from the computation"), }; xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list); bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list); diff --git a/tensorflow/compiler/xla/tools/show_signature.cc b/tensorflow/compiler/xla/tools/show_signature.cc index 1f3340cbc6afa9..4e53fafcc97ff5 100644 --- a/tensorflow/compiler/xla/tools/show_signature.cc +++ b/tensorflow/compiler/xla/tools/show_signature.cc @@ -18,7 +18,7 @@ limitations under the License. // Shows the signature (ProgramShape) of binary snapshot proto(s) on the command // line. // -// some_binary_snapshot_proto is obtained by serializing the SessionModule from +// some_binary_snapshot_proto is obtained by serializing the HloSnapshot from // ServiceInterface::SnapshotComputation to disk. // // The output format is: @@ -31,9 +31,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/client_library.h" -#include "tensorflow/compiler/xla/client/computation.h" #include "tensorflow/compiler/xla/client/local_client.h" -#include "tensorflow/compiler/xla/service/session.pb.h" +#include "tensorflow/compiler/xla/service/hlo.pb.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/statusor.h" #include "tensorflow/compiler/xla/types.h" @@ -49,13 +48,14 @@ namespace tools { void RealMain(tensorflow::gtl::ArraySlice args) { Client* client = ClientLibrary::LocalClientOrDie(); for (char* arg : args) { - SessionModule module; + HloSnapshot module; TF_CHECK_OK( tensorflow::ReadBinaryProto(tensorflow::Env::Default(), arg, &module)); - Computation computation = client->LoadSnapshot(module).ConsumeValueOrDie(); + auto computation = client->LoadSnapshot(module).ConsumeValueOrDie(); std::unique_ptr shape = client->GetComputationShape(computation).ConsumeValueOrDie(); - fprintf(stdout, "%s: %s :: %s\n", arg, module.entry().name().c_str(), + fprintf(stdout, "%s: %s :: %s\n", arg, + module.hlo().hlo_module().name().c_str(), ShapeUtil::HumanString(*shape).c_str()); } } diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h index be33bd6dd1304f..b4f45cc972d3d3 100644 --- a/tensorflow/compiler/xla/util.h +++ b/tensorflow/compiler/xla/util.h @@ -218,6 +218,12 @@ Status Unavailable(const char* format, ...) TF_PRINTF_ATTRIBUTE(1, 2); // Passed-varargs variant of the InvalidArgument factory above. Status InvalidArgumentV(const char* format, va_list args); +template +Status InvalidArgumentStrCat(Args&&... concat) { + return InvalidArgument( + "%s", tensorflow::strings::StrCat(std::forward(concat)...).c_str()); +} + template Status UnimplementedStrCat(Args&&... concat) { return Unimplemented( @@ -486,6 +492,12 @@ bool c_is_sorted(const C& c) { return std::is_sorted(std::begin(c), std::end(c)); } +template +bool c_is_sorted(const C& c, Compare&& comp) { + return std::is_sorted(std::begin(c), std::end(c), + std::forward(comp)); +} + template auto c_adjacent_find(const C& c) -> decltype(std::begin(c)) { return std::adjacent_find(std::begin(c), std::end(c)); @@ -514,12 +526,29 @@ typename std::decay::type c_accumulate(const Sequence& sequence, T&& init, std::forward(binary_op)); } +template +typename std::iterator_traits< + decltype(std::begin(std::declval()))>::difference_type +c_count_if(const C& c, Pred&& pred) { + return std::count_if(std::begin(c), std::end(c), std::forward(pred)); +} + template int64 FindIndex(const C& c, Value&& value) { auto it = c_find(c, std::forward(value)); return std::distance(c.begin(), it); } +template +void InsertAt(C* c, int64 index, Value&& value) { + c->insert(c->begin() + index, std::forward(value)); +} + +template +void EraseAt(C* c, int64 index) { + c->erase(c->begin() + index); +} + // Returns true if `x` fits in 32-bits. 
template bool IsInt32(T x) { diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto index f619b8dc24038a..53ba120d21a9e1 100644 --- a/tensorflow/compiler/xla/xla.proto +++ b/tensorflow/compiler/xla/xla.proto @@ -17,7 +17,6 @@ syntax = "proto3"; import "tensorflow/compiler/xla/xla_data.proto"; import "tensorflow/compiler/xla/service/hlo.proto"; -import "tensorflow/compiler/xla/service/session.proto"; package xla; @@ -230,14 +229,6 @@ message SnapshotComputationRequest { ComputationHandle computation = 1; } -message SnapshotComputationResponse { - SessionModule module = 1; -} - -message LoadComputationSnapshotRequest { - SessionModule module = 1; -} - message LoadComputationSnapshotResponse { ComputationHandle computation = 1; } diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto index d23f9e5918f54c..6bdfb0179cd6a5 100644 --- a/tensorflow/compiler/xla/xla_data.proto +++ b/tensorflow/compiler/xla/xla_data.proto @@ -66,11 +66,16 @@ enum PrimitiveType { // in the dimensions field. TUPLE = 13; - // An opaque type used for passing context specific data to a custom - // operation. + // An opaque type used for passing context-specific data to a custom + // operation. Shapes of this primitive type will have empty dimensions and + // tuple_shapes fields. OPAQUE = 14; - // Next = 17 + // A token type threaded between side-effecting operations. Shapes of this + // primitive type will have empty dimensions and tuple_shapes fields. + TOKEN = 17; + + // Next = 18 } // Describes the value held inside padding elements. @@ -134,6 +139,8 @@ enum Format { // example, Convert) are ignored. // // See the XLA documentation for more information on shapes and layouts. +// +// LINT.IfChange message Layout { // The method used to store the data in memory. The format determines which of // the other fields are used by the layout. @@ -159,9 +166,12 @@ message Layout { // memory. This field must be unset unless the format is SPARSE. int64 max_sparse_elements = 5; - // Important: if any field is added, be sure to modify ShapeUtil::Equal() - // appropriately to account for the new field. + // Important: if any field is added, be sure to modify ShapeUtil::Equal() and + // LayoutUtil::Hash appropriately to account for the new field. } +// LINT.ThenChange( \ +// https://www.tensorflow.org/code/tensorflow/compiler/xla/shape_util.cc, \ +// https://www.tensorflow.org/code/tensorflow/compiler/xla/layout_util.cc) // A shape describes the number of dimensions in the array, the size of each // dimension, and the primitive component type. @@ -170,6 +180,8 @@ message Layout { // defined. // // See the XLA documentation for more information on shapes and layouts. +// +// LINT.IfChange message Shape { reserved 1; reserved "rank"; @@ -190,9 +202,12 @@ message Shape { // The layout used to back this shape. Layout layout = 5; - // Important: if any field is added, be sure to modify ShapeUtil::Equal() and - // ShapeUtil::Compatible() appropriately to account for the new field. + // Important: if any field is added, be sure to modify ShapeUtil::Equal(), + // ShapeUtil::Compatible() and ShapeUtil::Hash() appropriately to account for + // the new field. } +// LINT.ThenChange( \ +// https://www.tensorflow.org/code/tensorflow/compiler/xla/shape_util.cc) // Shape of the parameters and output of a computation (like a traditional // function signature). @@ -804,6 +819,12 @@ enum UnaryOperation { // Elementwise, computes clz(x). 
UNOP_CLZ = 17; + + // Elementwise, computes exp(x)-1. + UNOP_EXPM1 = 18; + + // Elementwise, computes log(x+1). + UNOP_LOG1P = 19; } message UnaryOpRequest { diff --git a/tensorflow/compiler/xla/xlalogo.png b/tensorflow/compiler/xla/xlalogo.png new file mode 100644 index 00000000000000..7a0a295953d0c4 Binary files /dev/null and b/tensorflow/compiler/xla/xlalogo.png differ diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD index abdbdb4cd22ff3..50b1ae5cc3cba2 100644 --- a/tensorflow/contrib/BUILD +++ b/tensorflow/contrib/BUILD @@ -31,13 +31,15 @@ py_library( "//tensorflow/contrib/cluster_resolver:cluster_resolver_py", "//tensorflow/contrib/coder:coder_py", "//tensorflow/contrib/compiler:compiler_py", + "//tensorflow/contrib/autograph", "//tensorflow/contrib/constrained_optimization", + "//tensorflow/contrib/control_flow", "//tensorflow/contrib/copy_graph:copy_graph_py", "//tensorflow/contrib/crf:crf_py", "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py", "//tensorflow/contrib/data", - "//tensorflow/contrib/distribute:distribute", "//tensorflow/contrib/deprecated:deprecated_py", + "//tensorflow/contrib/distribute:distribute", "//tensorflow/contrib/distributions:distributions_py", "//tensorflow/contrib/eager/python:tfe", "//tensorflow/contrib/estimator:estimator_py", @@ -71,6 +73,7 @@ py_library( "//tensorflow/contrib/memory_stats:memory_stats_py", "//tensorflow/contrib/meta_graph_transform", "//tensorflow/contrib/metrics:metrics_py", + "//tensorflow/contrib/mixed_precision:mixed_precision", "//tensorflow/contrib/model_pruning", "//tensorflow/contrib/nccl:nccl_py", "//tensorflow/contrib/nearest_neighbor:nearest_neighbor_py", @@ -82,7 +85,6 @@ py_library( "//tensorflow/contrib/proto", "//tensorflow/contrib/quantization:quantization_py", "//tensorflow/contrib/quantize:quantize_graph", - "//tensorflow/contrib/autograph", "//tensorflow/contrib/receptive_field:receptive_field_py", "//tensorflow/contrib/recurrent:recurrent_py", "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py", diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py index 7f33d460dce077..ad8c40395c2cdc 100644 --- a/tensorflow/contrib/__init__.py +++ b/tensorflow/contrib/__init__.py @@ -30,6 +30,7 @@ from tensorflow.contrib import coder from tensorflow.contrib import compiler from tensorflow.contrib import constrained_optimization +from tensorflow.contrib import control_flow from tensorflow.contrib import copy_graph from tensorflow.contrib import crf from tensorflow.contrib import cudnn_rnn @@ -60,6 +61,7 @@ from tensorflow.contrib import losses from tensorflow.contrib import memory_stats from tensorflow.contrib import metrics +from tensorflow.contrib import mixed_precision from tensorflow.contrib import model_pruning from tensorflow.contrib import nccl from tensorflow.contrib import nn @@ -69,7 +71,6 @@ from tensorflow.contrib import proto from tensorflow.contrib import quantization from tensorflow.contrib import quantize -from tensorflow.contrib import recurrent from tensorflow.contrib import reduce_slice_ops from tensorflow.contrib import resampler from tensorflow.contrib import rnn @@ -96,6 +97,7 @@ from tensorflow.contrib.lite.python import lite from tensorflow.contrib.optimizer_v2 import optimizer_v2_symbols as optimizer_v2 from tensorflow.contrib.receptive_field import receptive_field_api as receptive_field +from tensorflow.contrib.recurrent.python import recurrent_api as recurrent from tensorflow.contrib.remote_fused_graph import pylib as remote_fused_graph from 
tensorflow.contrib.specs import python as specs from tensorflow.contrib.summary import summary diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD index 62d1b1cf079d04..881808a98bfd68 100644 --- a/tensorflow/contrib/all_reduce/BUILD +++ b/tensorflow/contrib/all_reduce/BUILD @@ -11,6 +11,16 @@ exports_files(["LICENSE"]) load("//tensorflow:tensorflow.bzl", "tf_py_test") +py_library( + name = "all_reduce_py", + srcs = ["__init__.py"], + srcs_version = "PY2AND3", + deps = [ + ":all_reduce", + "//tensorflow/python:util", + ], +) + py_library( name = "all_reduce", srcs = [ diff --git a/tensorflow/contrib/all_reduce/__init__.py b/tensorflow/contrib/all_reduce/__init__.py new file mode 100644 index 00000000000000..f9824f4cfbf83d --- /dev/null +++ b/tensorflow/contrib/all_reduce/__init__.py @@ -0,0 +1,39 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""All-reduce implementations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# pylint: disable=unused-import,line-too-long,wildcard-import +from tensorflow.contrib.all_reduce.python.all_reduce import * + +from tensorflow.python.util.all_util import remove_undocumented +# pylint: enable=unused-import,line-too-long,wildcard-import + +_allowed_symbols = [ + 'build_ring_all_reduce', + 'build_recursive_hd_all_reduce', + 'build_shuffle_all_reduce', + 'build_nccl_all_reduce', + 'build_nccl_then_ring', + 'build_nccl_then_recursive_hd', + 'build_nccl_then_shuffle', + 'build_shuffle_then_ring', + 'build_shuffle_then_shuffle' +] + +remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/contrib/android/BUILD b/tensorflow/contrib/android/BUILD index 60306ebdc6cddb..c10179ba8b290b 100644 --- a/tensorflow/contrib/android/BUILD +++ b/tensorflow/contrib/android/BUILD @@ -72,7 +72,7 @@ cc_binary( "-s", "-Wl,--gc-sections", "-Wl,--version-script", # This line must be directly followed by LINKER_SCRIPT. - LINKER_SCRIPT, + "$(location {})".format(LINKER_SCRIPT), ]), linkshared = 1, linkstatic = 1, diff --git a/tensorflow/contrib/android/jni/run_stats_jni.cc b/tensorflow/contrib/android/jni/run_stats_jni.cc index 707853b59befc2..30de7b59af79cb 100644 --- a/tensorflow/contrib/android/jni/run_stats_jni.cc +++ b/tensorflow/contrib/android/jni/run_stats_jni.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/contrib/android/jni/run_stats_jni.h" #include + #include #include "tensorflow/core/protobuf/config.pb.h" @@ -73,7 +74,8 @@ JNIEXPORT jstring RUN_STATS_METHOD(summary)(JNIEnv* env, jclass clazz, StatSummarizer* s = requireHandle(env, handle); if (s == nullptr) return nullptr; std::stringstream ret; - ret << s->GetStatsByMetric("Top 10 CPU", StatSummarizer::BY_TIME, 10) + ret << s->GetStatsByMetric("Top 10 CPU", tensorflow::StatsCalculator::BY_TIME, + 10) << s->GetStatsByNodeType() << s->ShortSummary(); return env->NewStringUTF(ret.str().c_str()); } diff --git a/tensorflow/contrib/autograph/CONTRIBUTING.md b/tensorflow/contrib/autograph/CONTRIBUTING.md new file mode 100644 index 00000000000000..a4aec8c74a9ad1 --- /dev/null +++ b/tensorflow/contrib/autograph/CONTRIBUTING.md @@ -0,0 +1,48 @@ +# How to Contribute + +We'd love to have your patches and contributions! Here are some guidelines. In general, we follow the [TensorFlow contributing guidelines](../../CONTRIBUTING.md), but have some [AutoGraph-specific style guidelines](STYLE_GUIDE.md). More details below. + +## TensorFlow Code of Conduct +Please review and follow the [TensorFlow Code of Conduct](../../CODE_OF_CONDUCT.md). + +## Contributor License Agreement + +Contributions to this project must be accompanied by a Contributor License +Agreement. You (or your employer) retain the copyright to your contribution; +this simply gives us permission to use and redistribute your contributions as +part of the project. Head over to to see +your current agreements on file or to sign a new one. + +You generally only need to submit a CLA once, so if you've already submitted one +(even if it was for a different project), you probably don't need to do it +again. + +## Code reviews + +All submissions, including submissions by project members, require review. We +use GitHub pull requests for this purpose. Consult [GitHub +Help](https://help.github.com/articles/about-pull-requests/) for more +information on using pull requests. + +After a pull request is approved, we merge it. Note our merging process differs +from GitHub in that we pull and submit the change into an internal version +control system. This system automatically pushes a git commit to the GitHub +repository (with credit to the original author) and closes the pull request. + +## Style + +See the [AutoGraph style guide](STYLE_GUIDE.md). + +## Unit tests + +Please include unit tests when contributing new features ([example here](converters/continue_statements_test.py)), as they help to a) prove that your code works correctly, and b) guard against future breaking +changes to lower the maintenance cost. +It's also helpful to check that any +changes you propose do not break existing unit tests. You can run tests using the command, + +```shell +bazel test --config=opt --copt=-O3 --copt=-march=native \ + //tensorflow/contrib/autograph/... +``` + +from the root of the `tensorflow` repository. For more details see the [main TensorFlow Contributing File](../../CONTRIBUTING.md) diff --git a/tensorflow/contrib/autograph/README.md b/tensorflow/contrib/autograph/README.md index 0fcbf5dd59cece..674859bed4ec15 100644 --- a/tensorflow/contrib/autograph/README.md +++ b/tensorflow/contrib/autograph/README.md @@ -1,6 +1,6 @@ # AutoGraph -IMPORTANT: AutoGraph is pre-alpha, under active development. Expect rough edges and bugs, but if you try it, we appreciate early feedback! +IMPORTANT: AutoGraph is alpha software, and under active development. 
Expect rough edges and bugs, but if you try it, we appreciate early feedback! We'd also love contributions ([please see our contributing guidelines](CONTRIBUTING.md) and our [style guide](STYLE_GUIDE.md)). AutoGraph is a Python to TensorFlow compiler. @@ -56,8 +56,6 @@ Use AutoGraph in one of the following ways, described below: 1. Annotations (simpler) 2. Functional API (more flexible) -NOTE: You can find more examples in this [interactive notebook](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb). - To get started, install the latest nightly TensorFlow build: ```shell @@ -70,6 +68,13 @@ Then import the `autograph` module from `tf.contrib`: from tensorflow.contrib import autograph as ag ``` +### Interactive demo notebooks + +For more extensive examples, check out these interactive notebooks: + + * [RNN trained using Keras and Estimators](https://colab.sandbox.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/rnn_keras_estimator.ipynb) + * [Demo from the TF Dev Summit 2018](https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/autograph/examples/notebooks/dev_summit_2018_demo.ipynb) + ## Using with annotations Annotating a function or class with `@convert` converts it in place: diff --git a/tensorflow/contrib/autograph/STYLE_GUIDE.md b/tensorflow/contrib/autograph/STYLE_GUIDE.md new file mode 100644 index 00000000000000..866e5f583a3457 --- /dev/null +++ b/tensorflow/contrib/autograph/STYLE_GUIDE.md @@ -0,0 +1,75 @@ +# AutoGraph Style Guide + +This page contains style decisions that developers should follow when +contributing code to AutoGraph. + +## TensorFlow Style + +Follow the [TensorFlow style +guide](https://www.tensorflow.org/community/style_guide), the [documentation
guide](https://www.tensorflow.org/community/documentation) and the +[Google Python style guide](https://google.github.io/styleguide/pyguide.html). + +Naming conventions: + +1. The name is TensorFlow, not Tensorflow. +2. The name is AutoGraph, not Autograph. + +## AutoGraph Style + +Below are AutoGraph-specific conventions. In the event of conflict, +they supersede all previous conventions. + +1. __Citations in Docstrings.__ Write a `#### References` subsection at the + bottom of any docstring with citations. Use ICLR’s bibliography style to + write references; for example, order entries by the first author's last + name. Add a link to the paper if the publication is open source (ideally, + arXiv). + + Write in-paragraph citations in general, e.g., [(Tran and Blei, 2018)][1]. + Write in-text citations when the citation is a noun, e.g., [Tran and Blei + (2018)][1]. Write citations with more than two authors using et al., e.g., + [(Tran et al., 2018)][1]. Separate multiple citations with semicolons, e.g., + ([Tran and Blei, 2018][1]; [Gelman and Rubin, 1992][2]). + + Examples: + + ```none + #### References + + # technical report + [1]: Tony Finch. Incremental calculation of weighted mean and variance. + _Technical Report_, 2009. + http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf + + # journal + [2]: Andrew Gelman and Donald B. Rubin. Inference from Iterative Simulation + Using Multiple Sequences. _Statistical Science_, 7(4):457-472, 1992. + + # arXiv preprint + # use "et al." for papers with too many authors to maintain + [3]: Aaron van den Oord et al. Parallel WaveNet: Fast High-Fidelity Speech + Synthesis.
_arXiv preprint arXiv:1711.10433_, 2017. + https://arxiv.org/abs/1711.10433 + + # conference + [4]: Yeming Wen, Paul Vicol, Jimmy Ba, Dustin Tran, and Roger Grosse. + Flipout: Efficient Pseudo-Independent Weight Perturbations on + Mini-Batches. In _International Conference on Learning + Representations_, 2018. + https://arxiv.org/abs/1803.04386 + ``` + +2. Avoid LaTeX in docstrings. + + * It is not rendered in many (if not most) editors and can be hard to read + for both LaTeX experts and non-experts. + +3. Write docstring and comment math using ASCII friendly notation; python using + operators. E.g., `x**2` better than `x^2`, `x[i, j]` better than `x_{i,j}`, + `sum{ f(x[i]) : i=1...n }` better than `\sum_{i=1}^n f(x_i)` `int{sin(x) dx: + x in [0, 2 pi]}` better than `\int_0^{2\pi} sin(x) dx`. + + * The more we stick to python style, the more someone can + copy/paste/execute. + * Python style is usually easier to read as ASCII. diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py index 3386c4eca4b93e..79d73af98097ae 100644 --- a/tensorflow/contrib/autograph/__init__.py +++ b/tensorflow/contrib/autograph/__init__.py @@ -23,18 +23,32 @@ # TODO(mdan): Bring only the relevant symbols to the top level. from tensorflow.contrib.autograph import utils +from tensorflow.contrib.autograph import operators from tensorflow.contrib.autograph.impl.api import convert from tensorflow.contrib.autograph.impl.api import converted_call from tensorflow.contrib.autograph.impl.api import do_not_convert from tensorflow.contrib.autograph.impl.api import RunMode from tensorflow.contrib.autograph.impl.api import to_code from tensorflow.contrib.autograph.impl.api import to_graph +from tensorflow.contrib.autograph.impl.special_functions import stack from tensorflow.contrib.autograph.pyct.transformer import AutographParseError from tensorflow.python.util.all_util import remove_undocumented _allowed_symbols = [ - 'utils', 'convert', 'converted_call', 'do_not_convert', 'RunMode', - 'to_code', 'to_graph', 'AutographParseError' + # Main API + 'RunMode', + 'convert', + 'converted_call', + 'do_not_convert', + 'to_code', + 'to_graph', + # Special functions and overloaded operators + 'operators', + 'stack', + # Exceptions + 'AutographParseError', + # Utilities: to be removed + 'utils', ] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/autograph/converters/asserts.py b/tensorflow/contrib/autograph/converters/asserts.py index 2d9e2c58e3afce..3b0db677ce5e41 100644 --- a/tensorflow/contrib/autograph/converters/asserts.py +++ b/tensorflow/contrib/autograph/converters/asserts.py @@ -33,7 +33,7 @@ def visit_Assert(self, node): # Note: The lone tf.Assert call will be wrapped with control_dependencies # by side_effect_guards. 
template = """ - tf.Assert(test, [msg]) + tf.Assert(test, (msg,)) """ if node.msg is None: diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py index 91de82f0a78cca..775d92c1d9f8bc 100644 --- a/tensorflow/contrib/autograph/converters/break_statements.py +++ b/tensorflow/contrib/autograph/converters/break_statements.py @@ -32,14 +32,6 @@ class BreakStatementTransformer(transformer.Base): """Canonicalizes break statements into additional conditionals.""" - def _track_body(self, nodes, break_var): - self.enter_local_scope() - self.set_local(CONTROL_VAR_NAME, break_var) - nodes = self.visit_block(nodes) - break_used = self.get_local(BREAK_USED, False) - self.exit_local_scope() - return nodes, break_used - def visit_Break(self, node): self.set_local(BREAK_USED, True) var_name = self.get_local(CONTROL_VAR_NAME) @@ -54,6 +46,7 @@ def _guard_if_present(self, block, var_name): """Prevents the block from executing if var_name is set.""" if not block: return block + template = """ if not var_name: block @@ -64,9 +57,17 @@ def _guard_if_present(self, block, var_name): block=block) return node + def _track_body(self, nodes, break_var): + self.enter_local_scope() + self.set_local(CONTROL_VAR_NAME, break_var) + nodes = self.visit_block(nodes) + break_used = self.get_local(BREAK_USED, False) + self.exit_local_scope() + return nodes, break_used + def visit_While(self, node): scope = anno.getanno(node, NodeAnno.BODY_SCOPE) - break_var = self.context.namer.new_symbol('break__', scope.referenced) + break_var = self.context.namer.new_symbol('break_', scope.referenced) node.test = self.visit(node.test) node.body, break_used = self._track_body(node.body, break_var) @@ -74,6 +75,10 @@ def visit_While(self, node): node.orelse = self.visit_block(node.orelse) if break_used: + # Python's else clause only triggers if the loop exited cleanly (e.g. + # break did not trigger). + guarded_orelse = self._guard_if_present(node.orelse, break_var) + template = """ var_name = False while test and not var_name: @@ -81,20 +86,18 @@ def visit_While(self, node): else: orelse """ - # Python's else clause only triggers if the loop exited cleanly (e.g. - # break did not trigger). node = templates.replace( template, var_name=break_var, test=node.test, body=node.body, - orelse=self._guard_if_present(node.orelse, break_var)) + orelse=guarded_orelse) return node def visit_For(self, node): scope = anno.getanno(node, NodeAnno.BODY_SCOPE) - break_var = self.context.namer.new_symbol('break__', scope.referenced) + break_var = self.context.namer.new_symbol('break_', scope.referenced) node.target = self.visit(node.target) node.iter = self.visit(node.iter) @@ -103,20 +106,33 @@ def visit_For(self, node): node.orelse = self.visit_block(node.orelse) if break_used: - node.orelse = self._guard_if_present(node.orelse, break_var) + # Python's else clause only triggers if the loop exited cleanly (e.g. + # break did not trigger). + guarded_orelse = self._guard_if_present(node.orelse, break_var) + extra_test = templates.replace_as_expression( + 'not var_name', var_name=break_var) + + # The extra test is hidden in the AST, which will confuse the static + # analysis. To mitigate that, we insert a no-op statement that ensures + # the control variable is marked as used. + # TODO(mdan): Use a marker instead, e.g. 
ag__.condition_loop_on(var_name) template = """ var_name = False - for_stmt + for target in iter_: + (var_name,) + body + else: + orelse """ - # Python's else clause only triggers if the loop exited cleanly (e.g. - # break did not trigger). node = templates.replace( template, var_name=break_var, - for_stmt=node) - extra_cond = templates.replace_as_expression( - 'not var_name', var_name=break_var) - anno.setanno(node[1], 'extra_cond', extra_cond) + iter_=node.iter, + target=node.target, + body=node.body, + orelse=guarded_orelse) + + anno.setanno(node[1], 'extra_test', extra_test) return node diff --git a/tensorflow/contrib/autograph/converters/builtin_functions.py b/tensorflow/contrib/autograph/converters/builtin_functions.py index 317711a866f731..231e4ee35a72f5 100644 --- a/tensorflow/contrib/autograph/converters/builtin_functions.py +++ b/tensorflow/contrib/autograph/converters/builtin_functions.py @@ -31,9 +31,6 @@ class BuiltinFunctionTransformer(transformer.Base): TF equivalent, like `len`. """ - def __init__(self, context): - super(BuiltinFunctionTransformer, self).__init__(context) - def _convert_builtin(self, node): template = """ ag__.utils.dynamic_builtin(func, args) @@ -51,7 +48,7 @@ def visit_Call(self, node): # TODO(mdan): This won't work if the function was hidden. # TODO(mdan): Rely on the live_val and use inspect_utils.is_builtin instead. if (isinstance(node.func, gast.Name) and - node.func.id in ('len', 'range', 'xrange')): + node.func.id in ('len', 'range', 'xrange', 'float', 'int')): return self._convert_builtin(node) # Print needs to be handled separately because it can be read as statement. if isinstance(node.func, gast.Name) and node.func.id == 'print': diff --git a/tensorflow/contrib/autograph/converters/call_trees.py b/tensorflow/contrib/autograph/converters/call_trees.py index 554f0471d44d54..b6ecdcb7809b1a 100644 --- a/tensorflow/contrib/autograph/converters/call_trees.py +++ b/tensorflow/contrib/autograph/converters/call_trees.py @@ -292,15 +292,25 @@ def visit_Call(self, node): raise NotImplementedError( 'py_func with return values (unknown function)') else: + if anno.hasanno(node.func, anno.Basic.QN): + # Special-case a few builtins that otherwise go undetected. This + # normally doesn't pose a problem, but the dict built-in doesn't + # work with inspect.getargspec which is required for dynamic functions. + # Note: expecting this is resilient to aliasing (e.g. + # dict = an_evil_dict), because in those cases the regular mechanisms + # process a simple user function. + qn = anno.getanno(node.func, anno.Basic.QN) + # Add items to this list as needed. + if str(qn) in ('dict',): + return node + if ast_util.matches(node, 'super(_)'): # super() calls are preserved. The class conversion mechanism will # ensure that they return the correct value. - pass - elif self.context.recursive: + return node + + if self.context.recursive: node = self._insert_dynamic_conversion(node) - else: - # Unresolved functions are allowed in non-recursive mode. 
- pass return node diff --git a/tensorflow/contrib/autograph/converters/continue_statements.py b/tensorflow/contrib/autograph/converters/continue_statements.py index 4299a8a9d59715..0417817a77e706 100644 --- a/tensorflow/contrib/autograph/converters/continue_statements.py +++ b/tensorflow/contrib/autograph/converters/continue_statements.py @@ -24,103 +24,115 @@ from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno -class ContinueCanonicalizationTransformer(transformer.Base): - """Canonicalizes continue statements into additional conditionals.""" +# Tags for local state. +CONTROL_VAR_NAME = 'control_var_name' +CONTINUE_USED = 'continue_used' +GUARD_CREATED = 'guard_created' +CREATE_GUARD_NEXT = 'create_guard_next' - def __init__(self, context): - super(ContinueCanonicalizationTransformer, self).__init__(context) - # This is a stack structure, to correctly process nested loops. - self.continuation_uses = [] - def _create_continuation_check(self): - template = """ - if not var_name: - pass - """ - cond, = templates.replace(template, var_name=self.continuation_uses[-1][1]) - cond.body = [] - return cond +class ContinueCanonicalizationTransformer(transformer.Base): + """Canonicalizes continue statements into additional conditionals.""" - def _create_continuation_trigger(self): + def visit_Continue(self, node): + self.set_local(CONTINUE_USED, True) template = """ var_name = True """ - assign, = templates.replace( - template, var_name=self.continuation_uses[-1][1]) - return assign - - def _create_continuation_init(self): - template = """ - var_name = False - """ - assign, = templates.replace( - template, var_name=self.continuation_uses[-1][1]) - return assign - - def _visit_and_reindent_if_necessary(self, nodes): - reorganized_nodes = [] - current_dest = reorganized_nodes - continue_used_in_block = False - for i, n in enumerate(nodes): - # TODO(mdan): This could be optimized if control structures are simple. - self.continuation_uses[-1][0] = False - n = self.visit(n) - current_dest.append(n) - if self.continuation_uses[-1][0]: - continue_used_in_block = True - if i < len(nodes) - 1: # Last statement in block needs no protection. 
- cond = self._create_continuation_check() - current_dest.append(cond) - current_dest = cond.body - self.continuation_uses[-1][0] = continue_used_in_block - return reorganized_nodes - - def _process_loop_block(self, block, scope): - cont_var = self.context.namer.new_symbol('cont_requested', scope.referenced) - self.continuation_uses.append([False, cont_var]) - block = self._visit_and_reindent_if_necessary(block) - if self.continuation_uses[-1][0]: - block.insert(0, self._create_continuation_init()) - self.continuation_uses.pop() - return block + return templates.replace( + template, var_name=self.get_local(CONTROL_VAR_NAME)) + + def _postprocess_statement(self, node): + # Example of how the state machine below works: + # + # 1| stmt # State: CONTINUE_USED = False + # | # Action: none + # 2| if cond: + # 3| continue # State: CONTINUE_USED = True, + # | # GUARD_CREATED = False, + # | # CREATE_GUARD_NEXT = False + # | # Action: set CREATE_GUARD_NEXT = True + # 4| stmt # State: CONTINUE_USED = True, + # | # GUARD_CREATED = False, + # | # CREATE_GUARD_NEXT = True + # | # Action: create `if not continue_used`, + # | # set GUARD_CREATED = True + # 5| stmt # State: CONTINUE_USED = True, GUARD_CREATED = True + # | # Action: none (will be wrapped under previously + # | # created if node) + + if self.get_local(CONTINUE_USED, False): + if self.get_local(GUARD_CREATED, False): + return node, None + + elif not self.get_local(CREATE_GUARD_NEXT, False): + self.set_local(CREATE_GUARD_NEXT, True) + return node, None + + else: + self.set_local(GUARD_CREATED, True) + template = """ + if not var_name: + original_node + """ + cond, = templates.replace( + template, + var_name=self.get_local(CONTROL_VAR_NAME), + original_node=node) + return cond, cond.body + return node, None + + def _visit_loop_body(self, node, nodes): + self.enter_local_scope() + scope = anno.getanno(node, NodeAnno.BODY_SCOPE) + continue_var = self.context.namer.new_symbol('continue_', scope.referenced) + self.set_local(CONTROL_VAR_NAME, continue_var) + + nodes = self.visit_block(nodes, after_visit=self._postprocess_statement) + + if self.get_local(CONTINUE_USED, False): + template = """ + var_name = False + """ + control_var_init = templates.replace(template, var_name=continue_var) + nodes = control_var_init + nodes + + self.exit_local_scope() + return nodes + + def _visit_non_loop_body(self, nodes): + self.enter_local_scope(inherit=(CONTROL_VAR_NAME,)) + nodes = self.visit_block(nodes, after_visit=self._postprocess_statement) + continue_used = self.get_local(CONTINUE_USED, False) + self.exit_local_scope(keep=(CONTINUE_USED,)) + return nodes, continue_used def visit_While(self, node): - self.generic_visit(node.test) - node.body = self._process_loop_block(node.body, - anno.getanno(node, - NodeAnno.BODY_SCOPE)) - for n in node.orelse: - self.generic_visit(n) + node.test = self.visit(node.test) + node.body = self._visit_loop_body(node, node.body) + # A continue in the else clause applies to the containing scope. + node.orelse, _ = self._visit_non_loop_body(node.orelse) return node def visit_For(self, node): - self.generic_visit(node.target) - self.generic_visit(node.iter) - node.body = self._process_loop_block(node.body, - anno.getanno(node, - NodeAnno.BODY_SCOPE)) - for n in node.orelse: - self.generic_visit(n) + node.target = self.generic_visit(node.target) + node.iter = self.generic_visit(node.iter) + node.body = self._visit_loop_body(node, node.body) + # A continue in the else clause applies to the containing scope. 
+ node.orelse, _ = self._visit_non_loop_body(node.orelse) return node def visit_If(self, node): - if self.continuation_uses: - self.generic_visit(node.test) - node.body = self._visit_and_reindent_if_necessary(node.body) - continue_used_in_body = self.continuation_uses[-1][0] - node.orelse = self._visit_and_reindent_if_necessary(node.orelse) - self.continuation_uses[-1][0] = ( - continue_used_in_body or self.continuation_uses[-1][0]) - else: - node = self.generic_visit(node) + node.test = self.generic_visit(node.test) + node.body, continue_used_body = self._visit_non_loop_body(node.body) + node.orelse, continue_used_orelse = self._visit_non_loop_body(node.orelse) + self.set_local(CONTINUE_USED, continue_used_body or continue_used_orelse) return node - def visit_Continue(self, node): - self.continuation_uses[-1][0] = True - return self._create_continuation_trigger() - - def visit_Break(self, node): - assert False, 'break statement should be desugared at this point' + def visit_With(self, node): + node.items = self.visit_block(node.items) + node.body, _ = self._visit_non_loop_body(node.body) + return node def transform(node, namer): diff --git a/tensorflow/contrib/autograph/converters/control_flow.py b/tensorflow/contrib/autograph/converters/control_flow.py index 2e26cdb3d9387d..d7ddbe8a04f648 100644 --- a/tensorflow/contrib/autograph/converters/control_flow.py +++ b/tensorflow/contrib/autograph/converters/control_flow.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Handles control flow statements: while, if.""" +"""Handles control flow statements: while, for, if.""" from __future__ import absolute_import from __future__ import division @@ -25,6 +25,7 @@ from tensorflow.contrib.autograph.pyct import parser from tensorflow.contrib.autograph.pyct import templates from tensorflow.contrib.autograph.pyct import transformer +from tensorflow.contrib.autograph.pyct.static_analysis import cfg from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno @@ -47,9 +48,6 @@ def new_symbol(self, name_root, reserved_locals): class ControlFlowTransformer(transformer.Base): """Transforms control flow structures like loops an conditionals.""" - def __init__(self, context): - super(ControlFlowTransformer, self).__init__(context) - def _create_cond_branch(self, body_name, aliased_orig_names, aliased_new_names, body, returns): if aliased_orig_names: @@ -98,30 +96,63 @@ def visit_If(self, node): body_scope = anno.getanno(node, NodeAnno.BODY_SCOPE) orelse_scope = anno.getanno(node, NodeAnno.ORELSE_SCOPE) - - if body_scope.created - orelse_scope.created: - raise ValueError( - 'The if branch creates new symbols that the else branch does not.') - if orelse_scope.created - body_scope.created: - raise ValueError( - 'The else branch creates new symbols that the if branch does not.') - - modified = tuple(body_scope.modified | orelse_scope.modified) - all_referenced = body_scope.referenced | orelse_scope.referenced + body_defs = body_scope.created | body_scope.modified + orelse_defs = orelse_scope.created | orelse_scope.modified + live = anno.getanno(node, 'live_out') + + # We'll need to check if we're closing over variables that are defined + # elsewhere in the function + # NOTE: we can only detect syntactic closure in the scope + # of the code passed in. 
If the AutoGraph'd function itself closes + # over other variables, this analysis won't take that into account. + defined = anno.getanno(node, 'defined_in') + + # We only need to return variables that are + # - modified by one or both branches + # - live (or has a live parent) at the end of the conditional + modified = [] + for def_ in body_defs | orelse_defs: + def_with_parents = set((def_,)) | def_.support_set + if live & def_with_parents: + modified.append(def_) + + # We need to check if live created variables are balanced + # in both branches + created = live & (body_scope.created | orelse_scope.created) + + # The if statement is illegal if there are variables that are created, + # that are also live, but both branches don't create them. + if created: + if created != (body_scope.created & live): + raise ValueError( + 'The main branch does not create all live symbols that the else ' + 'branch does.') + if created != (orelse_scope.created & live): + raise ValueError( + 'The else branch does not create all live symbols that the main ' + 'branch does.') # Alias the closure variables inside the conditional functions # to avoid errors caused by the local variables created in the branch # functions. - need_alias = ( - (body_scope.modified | orelse_scope.modified) - - (body_scope.created | orelse_scope.created)) - aliased_orig_names = tuple(need_alias) - aliased_new_names = tuple( - self.context.namer.new_symbol(s.ssf(), all_referenced) - for s in aliased_orig_names) - alias_map = dict(zip(aliased_orig_names, aliased_new_names)) - node_body = ast_util.rename_symbols(node.body, alias_map) - node_orelse = ast_util.rename_symbols(node.orelse, alias_map) + # We will alias variables independently for body and orelse scope, + # because different branches might write different variables. 
+ aliased_body_orig_names = tuple(body_scope.modified - body_scope.created) + aliased_orelse_orig_names = tuple(orelse_scope.modified - + orelse_scope.created) + aliased_body_new_names = tuple( + self.context.namer.new_symbol(s.ssf(), body_scope.referenced) + for s in aliased_body_orig_names) + aliased_orelse_new_names = tuple( + self.context.namer.new_symbol(s.ssf(), orelse_scope.referenced) + for s in aliased_orelse_orig_names) + + alias_body_map = dict(zip(aliased_body_orig_names, aliased_body_new_names)) + alias_orelse_map = dict( + zip(aliased_orelse_orig_names, aliased_orelse_new_names)) + + node_body = ast_util.rename_symbols(node.body, alias_body_map) + node_orelse = ast_util.rename_symbols(node.orelse, alias_orelse_map) if not modified: # When the cond would return no value, we leave the cond called without @@ -134,26 +165,47 @@ def visit_If(self, node): else: results = gast.Tuple([s.ast() for s in modified], None) - body_name = self.context.namer.new_symbol('if_true', all_referenced) - orelse_name = self.context.namer.new_symbol('if_false', all_referenced) + body_name = self.context.namer.new_symbol('if_true', body_scope.referenced) + orelse_name = self.context.namer.new_symbol('if_false', + orelse_scope.referenced) if modified: - body_returns = tuple( - alias_map[s] if s in aliased_orig_names else s for s in modified) + + def build_returns(aliased_names, alias_map, scope): + """Builds list of return variables for a branch of a conditional.""" + returns = [] + for s in modified: + if s in aliased_names: + returns.append(alias_map[s]) + else: + if s not in scope.created | defined: + raise ValueError( + 'Attempting to return variable "%s" from the true branch of ' + 'a conditional, but it was not closed over, or created in ' + 'this branch.' 
% str(s)) + else: + returns.append(s) + return tuple(returns) + + body_returns = build_returns(aliased_body_orig_names, alias_body_map, + body_scope) + orelse_returns = build_returns(aliased_orelse_orig_names, + alias_orelse_map, orelse_scope) + else: - body_returns = templates.replace('tf.ones(())')[0].value + body_returns = orelse_returns = templates.replace('tf.ones(())')[0].value body_def = self._create_cond_branch( body_name, - aliased_orig_names=tuple(aliased_orig_names), - aliased_new_names=tuple(aliased_new_names), + aliased_orig_names=tuple(aliased_body_orig_names), + aliased_new_names=tuple(aliased_body_new_names), body=node_body, returns=body_returns) orelse_def = self._create_cond_branch( orelse_name, - aliased_orig_names=tuple(aliased_orig_names), - aliased_new_names=tuple(aliased_new_names), + aliased_orig_names=tuple(aliased_orelse_orig_names), + aliased_new_names=tuple(aliased_orelse_new_names), body=node_orelse, - returns=body_returns) + returns=orelse_returns) cond_expr = self._create_cond_expr(results, node.test, body_name, orelse_name) @@ -207,7 +259,7 @@ def test_name(state_ssf): def body_name(state_ssf): body return state_ssf, - state_ast_tuple = ag__.while_loop( + state_ast_tuple = ag__.while_stmt( test_name, body_name, (state,), (extra_deps,)) """ node = templates.replace( @@ -252,31 +304,31 @@ def visit_For(self, node): state_ast_tuple = gast.Tuple([n.ast() for n in state], None) node_body = ast_util.rename_symbols(node.body, ssf_map) - if anno.hasanno(node, 'extra_cond'): - extra_cond = anno.getanno(node, 'extra_cond') - extra_cond = ast_util.rename_symbols(extra_cond, ssf_map) + if anno.hasanno(node, 'extra_test'): + extra_test = anno.getanno(node, 'extra_test') + extra_test = ast_util.rename_symbols(extra_test, ssf_map) else: - extra_cond = parser.parse_expression('True') + extra_test = parser.parse_expression('True') template = """ - def extra_cond_name(state_ssf): - return extra_cond_expr + def extra_test_name(state_ssf): + return extra_test_expr def body_name(iterate, state_ssf): body return state_ssf, - state_ast_tuple = ag__.for_loop( - iterated, extra_cond_name, body_name, (state,)) + state_ast_tuple = ag__.for_stmt( + iter_, extra_test_name, body_name, (state,)) """ node = templates.replace( template, state=state, state_ssf=state_ssf, state_ast_tuple=state_ast_tuple, - iterated=node.iter, + iter_=node.iter, iterate=node.target, - extra_cond_name=self.context.namer.new_symbol('extra_cond', + extra_test_name=self.context.namer.new_symbol('extra_test', all_referenced), - extra_cond_expr=extra_cond, + extra_test_expr=extra_test, body_name=self.context.namer.new_symbol('loop_body', all_referenced), body=node_body) @@ -284,6 +336,7 @@ def body_name(iterate, state_ssf): def transform(node, context): - t = ControlFlowTransformer(context) - node = t.visit(node) + cfg.run_analyses(node, cfg.Liveness(context)) + cfg.run_analyses(node, cfg.Defined(context)) + node = ControlFlowTransformer(context).visit(node) return node diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py index c5610b16b4e5de..9d23d9b5b7e8e8 100644 --- a/tensorflow/contrib/autograph/converters/control_flow_test.py +++ b/tensorflow/contrib/autograph/converters/control_flow_test.py @@ -22,6 +22,7 @@ from tensorflow.contrib.autograph.converters import converter_test_base from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops from 
tensorflow.python.ops import control_flow_ops from tensorflow.python.platform import test @@ -41,7 +42,7 @@ def test_fn(n): node = self.parse_and_analyze(test_fn, {}) node = control_flow.transform(node, self.ctx) - with self.compiled(node, control_flow_ops.while_loop) as result: + with self.compiled(node) as result: with self.test_session() as sess: self.assertEqual((10, 5, 5), sess.run(result.test_fn(constant_op.constant(5)))) @@ -56,7 +57,7 @@ def test_fn(n): node = self.parse_and_analyze(test_fn, {}) node = control_flow.transform(node, self.ctx) - with self.compiled(node, control_flow_ops.while_loop) as result: + with self.compiled(node) as result: with self.test_session() as sess: self.assertEqual(0, sess.run(result.test_fn(constant_op.constant(5)))) @@ -74,7 +75,7 @@ def test_fn(n): node = self.parse_and_analyze(test_fn, {}) node = control_flow.transform(node, self.ctx) - with self.compiled(node, control_flow_ops.cond) as result: + with self.compiled(node) as result: with self.test_session() as sess: self.assertEqual((-1, 0), sess.run(result.test_fn(constant_op.constant(1)))) @@ -91,10 +92,95 @@ def test_fn(n): node = self.parse_and_analyze(test_fn, {}) node = control_flow.transform(node, self.ctx) - with self.compiled(node, control_flow_ops.cond) as result: + with self.compiled(node) as result: with self.test_session() as sess: self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1)))) + def test_imbalanced_aliasing(self): + + def test_fn(n): + if n > 0: + n = 3 + return n + + node = self.parse_and_analyze(test_fn, {}) + node = control_flow.transform(node, self.ctx) + + with self.compiled(node, control_flow_ops.cond) as result: + with self.test_session() as sess: + self.assertEqual(3, sess.run(result.test_fn(constant_op.constant(2)))) + self.assertEqual(-3, sess.run(result.test_fn(constant_op.constant(-3)))) + + def test_ignore_unread_variable(self): + + def test_fn(n): + b = 3 # pylint: disable=unused-variable + if n > 0: + b = 4 + return n + + node = self.parse_and_analyze(test_fn, {}) + node = control_flow.transform(node, self.ctx) + + with self.compiled(node, control_flow_ops.cond, array_ops.ones) as result: + with self.test_session() as sess: + self.assertEqual(3, sess.run(result.test_fn(constant_op.constant(3)))) + self.assertEqual(-3, sess.run(result.test_fn(constant_op.constant(-3)))) + + def test_handle_temp_variable(self): + + def test_fn_using_temp(x, y, w): + if x < y: + z = x + y + else: + w = 2 + tmp = w + z = x - tmp + return z, w + + node = self.parse_and_analyze(test_fn_using_temp, {}) + node = control_flow.transform(node, self.ctx) + + with self.compiled(node, control_flow_ops.cond, array_ops.ones) as result: + with self.test_session() as sess: + z, w = sess.run( + result.test_fn_using_temp( + constant_op.constant(-3), constant_op.constant(3), + constant_op.constant(3))) + self.assertEqual(0, z) + self.assertEqual(3, w) + z, w = sess.run( + result.test_fn_using_temp( + constant_op.constant(3), constant_op.constant(-3), + constant_op.constant(3))) + self.assertEqual(1, z) + self.assertEqual(2, w) + + def test_fn_ignoring_temp(x, y, w): + if x < y: + z = x + y + else: + w = 2 + tmp = w + z = x - tmp + return z + + node = self.parse_and_analyze(test_fn_ignoring_temp, {}) + node = control_flow.transform(node, self.ctx) + + with self.compiled(node, control_flow_ops.cond, array_ops.ones) as result: + with self.test_session() as sess: + z = sess.run( + result.test_fn_ignoring_temp( + constant_op.constant(-3), constant_op.constant(3), + 
constant_op.constant(3))) + self.assertEqual(0, z) + z = sess.run( + result.test_fn_ignoring_temp( + constant_op.constant(3), constant_op.constant(-3), + constant_op.constant(3))) + self.assertEqual(1, z) + def test_simple_for(self): def test_fn(l): diff --git a/tensorflow/contrib/autograph/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD index 54424e26472b84..91ae0b9b82c6f6 100644 --- a/tensorflow/contrib/autograph/impl/BUILD +++ b/tensorflow/contrib/autograph/impl/BUILD @@ -21,6 +21,7 @@ py_library( "config.py", "conversion.py", "naming.py", + "special_functions.py", ], srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], @@ -69,3 +70,13 @@ py_test( "//tensorflow/python:client_testlib", ], ) + +py_test( + name = "special_functions_test", + srcs = ["special_functions_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":impl", + "//tensorflow/python:client_testlib", + ], +) diff --git a/tensorflow/contrib/autograph/impl/config.py b/tensorflow/contrib/autograph/impl/config.py index 2600088595a127..878bb7e12f2b39 100644 --- a/tensorflow/contrib/autograph/impl/config.py +++ b/tensorflow/contrib/autograph/impl/config.py @@ -33,7 +33,7 @@ (utils.__name__,), # All of tensorflow's subpackages. Unlike the root tf module, they don't - # have well-known names. Not refering to the module directly to avoid + # have well-known names. Not referring to the module directly to avoid # circular imports. ( utils.__name__[:-len('.contrib.autograph.utils')],), diff --git a/tensorflow/contrib/autograph/impl/conversion_test.py b/tensorflow/contrib/autograph/impl/conversion_test.py index 5edd8e74a8899a..bc61498b5422f5 100644 --- a/tensorflow/contrib/autograph/impl/conversion_test.py +++ b/tensorflow/contrib/autograph/impl/conversion_test.py @@ -24,7 +24,7 @@ from tensorflow.contrib.autograph.impl import api from tensorflow.contrib.autograph.impl import conversion from tensorflow.python.framework import constant_op -from tensorflow.python.keras._impl.keras.engine import training +from tensorflow.python.keras.engine import training from tensorflow.python.platform import test diff --git a/tensorflow/contrib/autograph/impl/special_functions.py b/tensorflow/contrib/autograph/impl/special_functions.py new file mode 100644 index 00000000000000..b7a8177c44c882 --- /dev/null +++ b/tensorflow/contrib/autograph/impl/special_functions.py @@ -0,0 +1,48 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Special functions that only make sense for AutoGraph. + +These functions are meant to ensure feature parity between Python and AutoGraph, +so that the exact same code works in both modes. In general, AutoGraph will +replace these calls. 
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.autograph.operators import data_structures + + +def stack(list_or_tensor, element_dtype=None): + """Stacks the input, if it admits the notion of stacking. No-op otherwise. + + For example, a list of tensors can be stacked into a larger tensor. This + function is similar to tf.stack, but it accepts non-lists and lists of + non-tensors as arguments. In the latter case, the function does nothing. + + Args: + list_or_tensor: Any entity. + element_dtype: Optional dtype for the elements in the list. Required if the + input is stackable, and the list is untyped. + + Returns: + If the input is stackable, a new object representing the stacked inputs. + Otherwise it returns list_or_tensor unchanged. + """ + return data_structures.list_stack( + list_or_tensor, + data_structures.ListStackOpts( + element_dtype=element_dtype, original_call=lambda x: x)) diff --git a/tensorflow/contrib/autograph/impl/special_functions_test.py b/tensorflow/contrib/autograph/impl/special_functions_test.py new file mode 100644 index 00000000000000..9b52d2a59b5a3e --- /dev/null +++ b/tensorflow/contrib/autograph/impl/special_functions_test.py @@ -0,0 +1,50 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for special_functions module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.autograph.impl import special_functions +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import list_ops +from tensorflow.python.platform import test + + +class SpecialFunctionsTest(test.TestCase): + + def test_basic(self): + self.assertEqual(special_functions.stack(1), 1) + self.assertListEqual(special_functions.stack([1, 2, 3]), [1, 2, 3]) + # TODO(mdan): This should probably forward to tf.stack. 
+ self.assertTrue( + isinstance( + special_functions.stack( + [constant_op.constant(1), + constant_op.constant(2)]), list)) + + t = constant_op.constant([1.0, 2.0]) + l = list_ops.tensor_list_from_tensor( + t, element_shape=constant_op.constant([], dtype=dtypes.int32)) + self.assertTrue( + tensor_util.is_tensor( + special_functions.stack(l, element_dtype=dtypes.float32))) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD index efb8d441dd839b..0c6ab65505ee03 100644 --- a/tensorflow/contrib/autograph/operators/BUILD +++ b/tensorflow/contrib/autograph/operators/BUILD @@ -22,6 +22,7 @@ py_library( "__init__.py", "control_flow.py", "data_structures.py", + "slices.py", ], srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], @@ -51,3 +52,13 @@ py_test( "//tensorflow/python:client_testlib", ], ) + +py_test( + name = "slices_test", + srcs = ["slices_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":operators", + "//tensorflow/python:client_testlib", + ], +) diff --git a/tensorflow/contrib/autograph/operators/__init__.py b/tensorflow/contrib/autograph/operators/__init__.py index 04b4734551d322..c900fd6af2ea5d 100644 --- a/tensorflow/contrib/autograph/operators/__init__.py +++ b/tensorflow/contrib/autograph/operators/__init__.py @@ -19,11 +19,32 @@ closures for the body. """ +# Naming conventions: +# * operator names match the name usually used for the respective Python +# idiom; examples: for_stmt, list_append +# * operator arguments match either of: +# - the corresponding Python AST attribute (e.g. the condition of an if +# statement is called test) if the operator represents an AST construct +# - the names used in the Python docs, if the operator is a function (e.g. +# list_ and x for append, see +# https://docs.python.org/3.7/tutorial/datastructures.html) +# +# All operators may accept a final argument named "opts", of a type that +# subclasses namedtuple and contains any arguments that are only required +# for some specializations of the operator. + from __future__ import absolute_import from __future__ import division from __future__ import print_function -# TODO(mdan): Add a container for implementation-specific toggles (throughout). 
- -from tensorflow.contrib.autograph.operators.control_flow import for_loop -from tensorflow.contrib.autograph.operators.control_flow import while_loop +from tensorflow.contrib.autograph.operators.control_flow import for_stmt +from tensorflow.contrib.autograph.operators.control_flow import while_stmt +from tensorflow.contrib.autograph.operators.data_structures import list_append +from tensorflow.contrib.autograph.operators.data_structures import list_pop +from tensorflow.contrib.autograph.operators.data_structures import list_stack +from tensorflow.contrib.autograph.operators.data_structures import ListPopOpts +from tensorflow.contrib.autograph.operators.data_structures import ListStackOpts +from tensorflow.contrib.autograph.operators.data_structures import new_list +from tensorflow.contrib.autograph.operators.slices import get_item +from tensorflow.contrib.autograph.operators.slices import GetItemOpts +from tensorflow.contrib.autograph.operators.slices import set_item diff --git a/tensorflow/contrib/autograph/operators/control_flow.py b/tensorflow/contrib/autograph/operators/control_flow.py index d9d8b0d593e537..671c9ccc13eaa8 100644 --- a/tensorflow/contrib/autograph/operators/control_flow.py +++ b/tensorflow/contrib/autograph/operators/control_flow.py @@ -25,44 +25,55 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import gen_math_ops -# TODO(mdan): Rename _loop to _stmt to follow Python nomenclature. -# TODO(mdan): Rename arguments to match the AST names. - -def for_loop(iterated, extra_cond, loop_body, init_state): +def for_stmt(iter_, extra_test, body, init_state): """Functional form of a for statement. - The loop operates on a so-called state, which includes all symbols that are - variant across loop iterations, excluding the iterate. In what follows we - refer to state as either a tuple of entities that represent an actual state, - or a list of arguments of the corresponding types. + The loop operates on a state, which includes all symbols that are + variant across loop iterations, excluding the iterate as well as the + variables local to the loop. + + For example, given the loop below that calculates the geometric and + arithmetic means of some numbers: + + geo_mean = 1 + arith_mean = 0 + for i in range(n): + a = numbers[i] + geo_mean *= a + arith_mean += a + + The state is represented by the variables geo_mean and arith_mean. The + argument for init_state may contain the tuple (1, 0); the body will + include the arguments geo_mean and arith_mean and will return a tuple + representing the new values for geo_mean and arith_mean, respectively. Args: - iterated: The entity being iterated over. - extra_cond: Callable with the state as arguments, and boolean return type. + iter_: The entity being iterated over. + extra_test: Callable with the state as arguments, and boolean return type. An additional loop condition. - loop_body: Callable with the iterate and the state as arguments, and + body: Callable with the iterate and the state as arguments, and state as return type. The actual loop body. init_state: Tuple containing the initial state. Returns: Tuple containing the final state.
""" - if tensor_util.is_tensor(iterated): - return _known_len_for_loop(iterated, extra_cond, loop_body, init_state) - elif isinstance(iterated, dataset_ops.Dataset): - return _dataset_for_loop(iterated, extra_cond, loop_body, init_state) + if tensor_util.is_tensor(iter_): + return _known_len_for_stmt(iter_, extra_test, body, init_state) + elif isinstance(iter_, dataset_ops.Dataset): + return _dataset_for_stmt(iter_, extra_test, body, init_state) else: - return _py_for_loop(iterated, extra_cond, loop_body, init_state) + return _py_for_stmt(iter_, extra_test, body, init_state) -def _py_for_loop(iterated, extra_cond, loop_body, init_state): - """Overload of for_loop that executes a Python for loop.""" +def _py_for_stmt(iter_, extra_test, body, init_state): + """Overload of for_stmt that executes a Python for loop.""" state = init_state - for iterate in iterated: - if not extra_cond(*state): + for target in iter_: + if not extra_test(*state): break - state = loop_body(iterate, *state) + state = body(target, *state) # TODO(mdan): Remove this special case. if len(state) == 1: @@ -70,23 +81,23 @@ def _py_for_loop(iterated, extra_cond, loop_body, init_state): return state -def _known_len_for_loop(iterated, extra_cond, loop_body, init_state): - """Overload of for_loop that iterates over objects that define a length.""" - n = builtins.dynamic_len(iterated) +def _known_len_for_stmt(iter_, extra_test, body, init_state): + """Overload of for_stmt that iterates over objects that define a length.""" + n = builtins.dynamic_len(iter_) def while_body(iterate_index, *state): - iterate = iterated[iterate_index] - new_state = loop_body(iterate, *state) + iterate = iter_[iterate_index] + new_state = body(iterate, *state) return (iterate_index + 1,) + new_state def while_cond(iterate_index, *state): - return gen_math_ops.logical_and(iterate_index < n, extra_cond(*state)) + return gen_math_ops.logical_and(iterate_index < n, extra_test(*state)) - results = while_loop( + results = while_stmt( while_cond, while_body, init_state=(0,) + init_state, - extra_deps=(iterated,), + extra_deps=(iter_,), opts=dict(maximum_iterations=n)) # Dropping the iteration index because it's not syntactically visible. results = results[1:] @@ -97,8 +108,8 @@ def while_cond(iterate_index, *state): return results -def _dataset_for_loop(ds, extra_cond, loop_body, init_state): - """Overload of for_loop that iterates over TF Datasets.""" +def _dataset_for_stmt(ds, extra_test, body, init_state): + """Overload of for_stmt that iterates over TF Datasets.""" # Because Datsets only expose get_next, in the style of Python iterators, # we are forced to unpack the loop as: # @@ -117,15 +128,15 @@ def tag_with(ds, tag): epoch_number, iterate = iterator.get_next() def while_body(epoch_number, iterate, *state): - new_state = loop_body(iterate, *state) + new_state = body(iterate, *state) epoch_number, iterate = iterator.get_next() return (epoch_number, iterate) + new_state def while_cond(epoch_number, iterate, *state): del iterate - return gen_math_ops.logical_and(epoch_number < 1, extra_cond(*state)) + return gen_math_ops.logical_and(epoch_number < 1, extra_test(*state)) - results = while_loop( + results = while_stmt( while_cond, while_body, init_state=(epoch_number, iterate) + init_state, @@ -140,7 +151,7 @@ def while_cond(epoch_number, iterate, *state): return results -def while_loop(loop_cond, loop_body, init_state, extra_deps, opts=None): +def while_stmt(test, body, init_state, extra_deps, opts=None): """Functional form of a while statement. 
The loop operates on a so-called state, which includes all symbols that are @@ -149,13 +160,13 @@ def while_loop(loop_cond, loop_body, init_state, extra_deps, opts=None): of the corresponding types. Args: - loop_cond: Callable with the state as arguments, and boolean return type. + test: Callable with the state as arguments, and boolean return type. The loop condition. - loop_body: Callable with the state as arguments, and state as return type. + body: Callable with the state as arguments, and state as return type. The actual loop body. init_state: Tuple containing the initial state. extra_deps: Tuple containing additional entities on which the loop may - depend, such as loop invariants referenced by loop_cond. Used + depend, such as loop invariants referenced by test. Used exclusively for dispatch control. opts: Optional dict of extra loop parameters. @@ -163,27 +174,27 @@ def while_loop(loop_cond, loop_body, init_state, extra_deps, opts=None): Tuple containing the final state. """ # TODO(mdan): Consider adding a generic mechanism for dynamic dispatch. - # That could be somethins as simple as a collection of dispatch rules, with + # That could be something as simple as a collection of dispatch rules, with # some prioritization. if any(tensor_util.is_tensor(v) for v in init_state + extra_deps): - return _tf_while_loop(loop_cond, loop_body, init_state, opts) + return _tf_while_stmt(test, body, init_state, opts) else: - return _py_while_loop(loop_cond, loop_body, init_state, opts) + return _py_while_stmt(test, body, init_state, opts) -def _tf_while_loop(loop_cond, loop_body, init_state, opts): - """Overload of while_loop that stages a TF while_loop.""" +def _tf_while_stmt(test, body, init_state, opts): + """Overload of while_stmt that stages a TF while_stmt.""" if opts is None: opts = {} - return control_flow_ops.while_loop(loop_cond, loop_body, init_state, **opts) + return control_flow_ops.while_loop(test, body, init_state, **opts) -def _py_while_loop(loop_cond, loop_body, init_state, opts): - """Overload of while_loop that executes a Python while loop.""" +def _py_while_stmt(test, body, init_state, opts): + """Overload of while_stmt that executes a Python while loop.""" del opts state = init_state - while loop_cond(*state): - state = loop_body(*state) + while test(*state): + state = body(*state) return state diff --git a/tensorflow/contrib/autograph/operators/control_flow_test.py b/tensorflow/contrib/autograph/operators/control_flow_test.py index a0cd0bfa82bb05..b14d7edba38461 100644 --- a/tensorflow/contrib/autograph/operators/control_flow_test.py +++ b/tensorflow/contrib/autograph/operators/control_flow_test.py @@ -29,28 +29,28 @@ class ForLoopTest(test.TestCase): def test_tensor(self): - s = control_flow.for_loop( + s = control_flow.for_stmt( constant_op.constant([1, 2, 3, 4]), - extra_cond=lambda s: True, - loop_body=lambda i, s: (s + i,), + extra_test=lambda s: True, + body=lambda i, s: (s + i,), init_state=(0,)) with self.test_session() as sess: self.assertEqual((10,), sess.run(s)) def test_python(self): - s = control_flow.for_loop( + s = control_flow.for_stmt( range(5), - extra_cond=lambda s: True, - loop_body=lambda i, s: (s + i,), + extra_test=lambda s: True, + body=lambda i, s: (s + i,), init_state=(0,)) self.assertEqual(10, s) def test_dataset(self): to_int32 = lambda i: math_ops.cast(i, dtypes.int32) - s = control_flow.for_loop( + s = control_flow.for_stmt( dataset_ops.Dataset.range(5).map(to_int32), - extra_cond=lambda s: True, - loop_body=lambda i, s: (s + i,), + 
extra_test=lambda s: True, + body=lambda i, s: (s + i,), init_state=(0,)) with self.test_session() as sess: self.assertEqual((10,), sess.run(s)) @@ -60,9 +60,9 @@ class WhileLoopTest(test.TestCase): def test_tensor(self): n = constant_op.constant(5) - results = control_flow.while_loop( - loop_cond=lambda i, s: i < n, - loop_body=lambda i, s: (i + 1, s + i,), + results = control_flow.while_stmt( + test=lambda i, s: i < n, + body=lambda i, s: (i + 1, s + i,), init_state=(0, 0), extra_deps=(n,)) with self.test_session() as sess: @@ -70,9 +70,9 @@ def test_tensor(self): def test_python(self): n = 5 - results = control_flow.while_loop( - loop_cond=lambda i, s: i < n, - loop_body=lambda i, s: (i + 1, s + i), + results = control_flow.while_stmt( + test=lambda i, s: i < n, + body=lambda i, s: (i + 1, s + i), init_state=(0, 0), extra_deps=(n,)) self.assertEqual((5, 10), results) diff --git a/tensorflow/contrib/autograph/operators/data_structures.py b/tensorflow/contrib/autograph/operators/data_structures.py index c862306baa9e81..06d8727b0fcc30 100644 --- a/tensorflow/contrib/autograph/operators/data_structures.py +++ b/tensorflow/contrib/autograph/operators/data_structures.py @@ -18,39 +18,250 @@ from __future__ import division from __future__ import print_function +import collections + +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import list_ops from tensorflow.python.ops import tensor_array_ops +from tensorflow.python.ops import variables + + +# TODO(mdan): Once control flow supports objects, repackage as a class. + + +def new_list(iterable=None): + """The list constructor. + + Args: + iterable: Optional elements to fill the list with. + + Returns: + A list-like object. The exact return value depends on the initial elements. + """ + if iterable: + elements = tuple(iterable) + else: + elements = () + + # TODO(mdan): Extend these criteria. + if any(isinstance(el, variables.Variable) for el in elements): + return _py_list_new(elements) + return _tf_tensor_list_new(elements) -# TODO(mdan): Add support for TensorList once functional. -# TODO(mdan): Add primitives for empty list, list with elements. +def _tf_tensor_list_new(elements): + """Overload of new_list that stages a Tensor list creation.""" + elements = tuple(ops.convert_to_tensor(el) for el in elements) + all_dtypes = set(el.dtype for el in elements) + if len(all_dtypes) == 1: + element_dtype = tuple(all_dtypes)[0] + else: + # Heterogeneous lists are ok. + element_dtype = dtypes.variant + + # TODO(mdan): This may fail for elements of variable shapes. + all_shapes = set(tuple(el.shape.as_list()) for el in elements) + if len(all_shapes) == 1: + element_shape = array_ops.shape(elements[0]) + else: + # Heterogeneous lists are ok. + element_shape = constant_op.constant(-1) # unknown shape, by convention + + l = list_ops.empty_tensor_list( + element_shape=element_shape, element_dtype=element_dtype) + for el in elements: + l = list_ops.tensor_list_push_back(l, el) + return l -def append(target, element): + +def _py_list_new(elements): + """Overload of new_list that creates a Python list.""" + return list(elements) + + +def list_append(list_, x): """The list append function. - Note: it is unspecified where target will be mutated or not. 
If target is - a TensorFlow entity, it will not be typically mutated. If target is a plain - list, it will be. In general, if the target is mutated then the return value + Note: it is unspecified whether list_ will be mutated or not. If list_ is + a TensorFlow entity, it will not be typically mutated. If list_ is a plain + list, it will be. In general, if the list is mutated then the return value should point to the original entity. Args: - target: An entity that supports append semantics. - element: The element to append. + list_: An entity that supports append semantics. + x: The element to append. Returns: - Same as target, after the append was performed. + Same as list_, after the append was performed. + + Raises: + ValueError: if list_ is not of a known list-like type. """ - if isinstance(target, tensor_array_ops.TensorArray): - return _tf_tensorarray_append(target, element) + if isinstance(list_, tensor_array_ops.TensorArray): + return _tf_tensorarray_append(list_, x) + elif tensor_util.is_tensor(list_): + if list_.dtype == dtypes.variant: + return _tf_tensor_list_append(list_, x) + else: + raise ValueError( + 'tensor lists are expected to be Tensors with dtype=tf.variant,' + ' instead found %s' % list_) else: - return _py_append(target, element) + return _py_list_append(list_, x) + + +def _tf_tensor_list_append(list_, x): + """Overload of list_append that stages a Tensor list write.""" + def empty_list_of_elements_like_x(): + tensor_x = ops.convert_to_tensor(x) + return list_ops.empty_tensor_list( + element_shape=array_ops.shape(tensor_x), + element_dtype=tensor_x.dtype) + + list_ = control_flow_ops.cond( + list_ops.tensor_list_length(list_) > 0, + lambda: list_, + empty_list_of_elements_like_x, + ) + return list_ops.tensor_list_push_back(list_, x) + + +def _tf_tensorarray_append(list_, x): + """Overload of list_append that stages a TensorArray write.""" + return list_.write(list_.size(), x) + + +def _py_list_append(list_, x): + """Overload of list_append that executes a Python list append.""" + # Revert to the original call. + list_.append(x) + return list_ + + +class ListPopOpts( + collections.namedtuple('ListPopOpts', ('element_dtype', 'element_shape'))): + pass + + +def list_pop(list_, i, opts): + """The list pop function. + + Note: it is unspecified whether list_ will be mutated or not. If list_ is + a TensorFlow entity, it will not be typically mutated. If list_ is a plain + list, it will be. In general, if the list is mutated then the return value + should point to the original entity. + + Args: + list_: An entity that supports pop semantics. + i: Optional index to pop from. May be None. + opts: A ListPopOpts. + + Returns: + Tuple (x, out_list_): + out_list_: same as list_, after the removal was performed. + x: the removed element value. + + Raises: + ValueError: if list_ is not of a known list-like type or the operation is + not supported for that type.
+ """ + assert isinstance(opts, ListPopOpts) + + if isinstance(list_, tensor_array_ops.TensorArray): + raise ValueError('TensorArray does not support item removal') + elif tensor_util.is_tensor(list_): + if list_.dtype == dtypes.variant: + return _tf_tensor_list_pop(list_, i, opts) + else: + raise ValueError( + 'tensor lists are expected to be Tensors with dtype=tf.variant,' + ' instead found %s' % list_) + else: + return _py_list_pop(list_, i) + + +def _tf_tensor_list_pop(list_, i, opts): + """Overload of list_pop that stages a Tensor list pop.""" + if i is not None: + raise NotImplementedError('tensor lists only support removing from the end') + + if opts.element_dtype is None: + raise ValueError('cannot pop from a list without knowing its element ' + 'type; use set_element_type to annotate it') + if opts.element_shape is None: + raise ValueError('cannot pop from a list without knowing its element ' + 'shape; use set_element_type to annotate it') + list_out, x = list_ops.tensor_list_pop_back( + list_, element_dtype=opts.element_dtype) + x.set_shape(opts.element_shape) + return list_out, x + + +def _py_list_pop(list_, i): + """Overload of list_pop that executes a Python list append.""" + if i is None: + x = list_.pop() + else: + x = list_.pop(i) + return list_, x + + +# TODO(mdan): Look into reducing duplication between all these containers. +class ListStackOpts( + collections.namedtuple('ListStackOpts', + ('element_dtype', 'original_call'))): + pass + + +def list_stack(list_, opts): + """The list stack function. + + This does not have a direct correspondent in Python. The closest idiom to + this is tf.append or np.stack. It's different from those in the sense that it + accepts a Tensor list, rather than a list of tensors. It can also accept + TensorArray. When the target is anything else, the dispatcher will rely on + ctx.original_call for fallback. + + Args: + list_: An entity that supports append semantics. + opts: A ListStackOpts object. + + Returns: + The output of the stack operation, typically a Tensor. + """ + assert isinstance(opts, ListStackOpts) + + if isinstance(list_, tensor_array_ops.TensorArray): + return _tf_tensorarray_stack(list_) + elif tensor_util.is_tensor(list_): + if list_.dtype == dtypes.variant: + return _tf_tensor_list_stack(list_, opts) + else: + # No-op for primitive Tensor arguments. + return list_ + else: + return _py_list_stack(list_, opts) + + +def _tf_tensorarray_stack(list_): + """Overload of list_stack that stages a TensorArray stack.""" + return list_.stack() -def _tf_tensorarray_append(target, element): - """Overload of append that stages a TensorArray write at the last position.""" - return target.write(target.size(), element) +def _tf_tensor_list_stack(list_, opts): + """Overload of list_stack that stages a Tensor list write.""" + if opts.element_dtype is None: + raise ValueError('cannot stack a list without knowing its element type;' + ' use set_element_type to annotate it') + return list_ops.tensor_list_stack(list_, element_dtype=opts.element_dtype) -def _py_append(target, element): - """Overload of append that executes a Python list append.""" - target.append(element) - return target +def _py_list_stack(list_, opts): + """Overload of list_stack that executes a Python list append.""" + # Revert to the original call. 
+ return opts.original_call(list_) diff --git a/tensorflow/contrib/autograph/operators/data_structures_test.py b/tensorflow/contrib/autograph/operators/data_structures_test.py index 577d28c34da39f..8bbb52d6c10b24 100644 --- a/tensorflow/contrib/autograph/operators/data_structures_test.py +++ b/tensorflow/contrib/autograph/operators/data_structures_test.py @@ -19,25 +19,98 @@ from __future__ import print_function from tensorflow.contrib.autograph.operators import data_structures +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import list_ops from tensorflow.python.ops import tensor_array_ops from tensorflow.python.platform import test -class AppendTest(test.TestCase): +class ListTest(test.TestCase): - def test_tf_tensorarray(self): + def test_new_list_empty(self): + l = data_structures.new_list() + # Can't evaluate an empty list. + # TODO(mdan): sess.run should allow tf.variant maybe? + self.assertTrue(isinstance(l, ops.Tensor)) + + def test_new_list_tensor(self): + l = data_structures.new_list([3, 4, 5]) + t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32) + with self.test_session() as sess: + self.assertAllEqual(sess.run(t), [3, 4, 5]) + + def test_append_tensor_list(self): + l = data_structures.new_list() + x = constant_op.constant([1, 2, 3]) + l = data_structures.list_append(l, x) + + t = list_ops.tensor_list_stack(l, element_dtype=x.dtype) + with self.test_session() as sess: + self.assertAllEqual(sess.run(t), [[1, 2, 3]]) + + def test_append_tensorarray(self): l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True) - l1 = data_structures.append(l, 1) - l2 = data_structures.append(l1, 2) + l1 = data_structures.list_append(l, 1) + l2 = data_structures.list_append(l1, 2) with self.test_session() as sess: self.assertAllEqual(sess.run(l1.stack()), [1]) self.assertAllEqual(sess.run(l2.stack()), [1, 2]) - def test_python(self): + def test_append_python(self): l = [] - self.assertAllEqual(data_structures.append(l, 1), [1]) - self.assertAllEqual(data_structures.append(l, 2), [1, 2]) + self.assertAllEqual(data_structures.list_append(l, 1), [1]) + self.assertAllEqual(data_structures.list_append(l, 2), [1, 2]) + + def test_pop_tensor_list(self): + initial_list = constant_op.constant([[1, 2], [3, 4]]) + elem_shape = constant_op.constant([2]) + l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape) + + opts = data_structures.ListPopOpts( + element_dtype=initial_list.dtype, + element_shape=(2,)) + + with self.assertRaises(NotImplementedError): + data_structures.list_pop(l, 0, opts) + + with self.test_session() as sess: + l, x = data_structures.list_pop(l, None, opts) + self.assertAllEqual(sess.run(x), [3, 4]) + + t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype) + self.assertAllEqual(sess.run(t), [[1, 2]]) + + def test_pop_python(self): + l = [1, 2, 3] + opts = data_structures.ListPopOpts(element_dtype=None, element_shape=()) + self.assertAllEqual(data_structures.list_pop(l, None, opts), ([1, 2], 3)) + self.assertAllEqual(data_structures.list_pop(l, None, opts), ([1], 2)) + + def test_stack_tensor_list(self): + initial_list = constant_op.constant([[1, 2], [3, 4]]) + elem_shape = constant_op.constant([2]) + l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape) + + opts = data_structures.ListStackOpts( + element_dtype=initial_list.dtype, original_call=None) + + with self.test_session() as 
sess: + t = data_structures.list_stack(l, opts) + self.assertAllEqual(sess.run(t), sess.run(initial_list)) + + def test_stack_fallback(self): + + def dummy_function(l): + # Lazy person's mock: just transform the argument in a way in which we + # can check that this function was indeed called. + return [x * 2 for x in l] + + opts = data_structures.ListStackOpts( + element_dtype=None, original_call=dummy_function) + + self.assertAllEqual(data_structures.list_stack([1, 2], opts), [2, 4]) if __name__ == '__main__': diff --git a/tensorflow/python/keras/applications/densenet/__init__.py b/tensorflow/contrib/autograph/operators/dispatch_context.py similarity index 60% rename from tensorflow/python/keras/applications/densenet/__init__.py rename to tensorflow/contrib/autograph/operators/dispatch_context.py index 6b8ea83920733a..097002465bd140 100644 --- a/tensorflow/python/keras/applications/densenet/__init__.py +++ b/tensorflow/contrib/autograph/operators/dispatch_context.py @@ -12,18 +12,30 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""DenseNet Keras applications.""" +"""Structures that allow uniform control over the dispatch process.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.applications.densenet import decode_predictions -from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet121 -from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet169 -from tensorflow.python.keras._impl.keras.applications.densenet import DenseNet201 -from tensorflow.python.keras._impl.keras.applications.densenet import preprocess_input +import collections -del absolute_import -del division -del print_function + +# TODO(mdan): This is where macro override controls fit. + + +class DispatchContext(collections.namedtuple( + 'DispatchContext', + ('options',))): + """Allows passing additional parameters to the specific implementations. + + Attributes: + options: Optional dict of extra arguments that may be required by specific + implementations. + """ + + def option(self, name): + return self.options[name] + + +NO_CTX = DispatchContext(options={}) diff --git a/tensorflow/contrib/autograph/operators/slices.py b/tensorflow/contrib/autograph/operators/slices.py new file mode 100644 index 00000000000000..04fbeb2f6e3923 --- /dev/null +++ b/tensorflow/contrib/autograph/operators/slices.py @@ -0,0 +1,133 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Operators specific to slicing operations.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import tensor_util +from tensorflow.python.ops import list_ops +from tensorflow.python.ops import tensor_array_ops + + +# TODO(mdan): Support extended slices. + + +class GetItemOpts(collections.namedtuple('GetItemOpts', ('element_dtype',))): + pass + + +def get_item(target, i, opts): + """The slice read operator (i.e. __getitem__). + + Note: it is unspecified whether target will be mutated or not. In general, + if target is mutable (like Python lists), it will be mutated. + + Args: + target: An entity that supports getitem semantics. + i: Index to read from. + opts: A GetItemOpts object. + + Returns: + The read element. + + Raises: + ValueError: if target is not of a supported type. + """ + assert isinstance(opts, GetItemOpts) + + if isinstance(target, tensor_array_ops.TensorArray): + return _tf_tensorarray_get_item(target, i) + elif tensor_util.is_tensor(target): + if target.dtype == dtypes.variant: + return _tf_tensor_list_get_item(target, i, opts) + else: + return _tf_tensor_get_item(target, i) + else: + return _py_get_item(target, i) + + +def _tf_tensorarray_get_item(target, i): + """Overload of get_item that stages a TensorArray read.""" + return target.read(i) + + +def _tf_tensor_list_get_item(target, i, opts): + """Overload of get_item that stages a Tensor list read.""" + if opts.element_dtype is None: + raise ValueError('cannot retrieve from a list without knowing its ' + 'element type; use set_element_type to annotate it') + x = list_ops.tensor_list_get_item(target, i, element_dtype=opts.element_dtype) + return x + + +def _tf_tensor_get_item(target, i): + """Overload of get_item that stages a Tensor (not Tensor list) read.""" + return target[i] + + +def _py_get_item(target, i): + """Overload of get_item that executes a Python list modification.""" + return target[i] + + +def set_item(target, i, x): + """The slice write operator (i.e. __setitem__). + + Note: it is unspecified whether target will be mutated or not. In general, + if target is mutable (like Python lists), it will be mutated. + + Args: + target: An entity that supports setitem semantics. + i: Index to modify. + x: The new element value. + + Returns: + Same as target, after the update was performed. + + Raises: + ValueError: if target is not of a supported type. 
+ """ + if isinstance(target, tensor_array_ops.TensorArray): + return _tf_tensorarray_set_item(target, i, x) + elif tensor_util.is_tensor(target): + if target.dtype == dtypes.variant: + return _tf_tensor_list_set_item(target, i, x) + else: + raise ValueError( + 'tensor lists are expected to be Tensors with dtype=tf.variant,' + ' instead found %s' % target) + else: + return _py_set_item(target, i, x) + + +def _tf_tensorarray_set_item(target, i, x): + """Overload of set_item that stages a TensorArray write.""" + return target.write(i, x) + + +def _tf_tensor_list_set_item(target, i, x): + """Overload of set_item that stages a Tensor list update.""" + return list_ops.tensor_list_set_item(target, i, x) + + +def _py_set_item(target, i, x): + """Overload of set_item that executes a Python list modification.""" + target[i] = x + return target diff --git a/tensorflow/contrib/autograph/operators/slices_test.py b/tensorflow/contrib/autograph/operators/slices_test.py new file mode 100644 index 00000000000000..d4aacb9d2015fe --- /dev/null +++ b/tensorflow/contrib/autograph/operators/slices_test.py @@ -0,0 +1,51 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for slices module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.autograph.operators import slices +from tensorflow.python.framework import constant_op +from tensorflow.python.ops import list_ops +from tensorflow.python.platform import test + + +class SlicesTest(test.TestCase): + + def test_set_item_tensor_list(self): + initial_list = constant_op.constant([[1, 2], [3, 4]]) + elem_shape = constant_op.constant([2]) + l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape) + l = slices.set_item(l, 0, [5, 6]) + + with self.test_session() as sess: + t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype) + self.assertAllEqual(sess.run(t), [[5, 6], [3, 4]]) + + def test_get_item_tensor_list(self): + initial_list = constant_op.constant([[1, 2], [3, 4]]) + elem_shape = constant_op.constant([2]) + l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape) + t = slices.get_item( + l, 1, slices.GetItemOpts(element_dtype=initial_list.dtype)) + + with self.test_session() as sess: + self.assertAllEqual(sess.run(t), [3, 4]) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD index 796ab445c74128..989b821e53a5ce 100644 --- a/tensorflow/contrib/autograph/pyct/BUILD +++ b/tensorflow/contrib/autograph/pyct/BUILD @@ -130,6 +130,7 @@ py_test( name = "transformer_test", srcs = ["transformer_test.py"], srcs_version = "PY2AND3", + tags = ["no_windows"], deps = [ ":pyct", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/autograph/pyct/anno.py 
b/tensorflow/contrib/autograph/pyct/anno.py index cc4a7edf02ed75..ae861627fd65cc 100644 --- a/tensorflow/contrib/autograph/pyct/anno.py +++ b/tensorflow/contrib/autograph/pyct/anno.py @@ -46,8 +46,15 @@ class Basic(NoValue): '`name_map` allows renaming symbols.') -def getanno(node, key, field_name='___pyct_anno'): - return getattr(node, field_name)[key] +FAIL = object() + + +def getanno(node, key, default=FAIL, field_name='___pyct_anno'): + if (default is FAIL or + (hasattr(node, field_name) and (key in getattr(node, field_name)))): + return getattr(node, field_name)[key] + else: + return default def hasanno(node, key, field_name='___pyct_anno'): @@ -73,5 +80,9 @@ def delanno(node, key, field_name='___pyct_anno'): def copyanno(from_node, to_node, key, field_name='___pyct_anno'): - if hasanno(from_node, key, field_name): - setanno(to_node, key, getanno(from_node, key, field_name), field_name) + if hasanno(from_node, key, field_name=field_name): + setanno( + to_node, + key, + getanno(from_node, key, field_name=field_name), + field_name=field_name) diff --git a/tensorflow/contrib/autograph/pyct/anno_test.py b/tensorflow/contrib/autograph/pyct/anno_test.py index 1d4d9d119e0c45..f2c0c8cf05ca4b 100644 --- a/tensorflow/contrib/autograph/pyct/anno_test.py +++ b/tensorflow/contrib/autograph/pyct/anno_test.py @@ -38,12 +38,14 @@ def test_basic(self): anno.setanno(node, 'foo', 3) self.assertTrue(anno.hasanno(node, 'foo')) - self.assertEqual(3, anno.getanno(node, 'foo')) + self.assertEqual(anno.getanno(node, 'foo'), 3) + self.assertEqual(anno.getanno(node, 'bar', default=7), 7) anno.delanno(node, 'foo') self.assertFalse(anno.hasanno(node, 'foo')) with self.assertRaises(AttributeError): anno.getanno(node, 'foo') + self.assertIsNone(anno.getanno(node, 'foo', default=None)) def test_copyanno(self): node_1 = ast.Name() diff --git a/tensorflow/contrib/autograph/pyct/qual_names.py b/tensorflow/contrib/autograph/pyct/qual_names.py index 583cf7ecd7bce3..da07013cf4f430 100644 --- a/tensorflow/contrib/autograph/pyct/qual_names.py +++ b/tensorflow/contrib/autograph/pyct/qual_names.py @@ -205,6 +205,7 @@ def visit_Attribute(self, node): return node def visit_Subscript(self, node): + # TODO(mdan): This may no longer apply if we overload getitem. node = self.generic_visit(node) s = node.slice if not isinstance(s, gast.Index): @@ -216,7 +217,11 @@ def visit_Subscript(self, node): elif isinstance(s.value, gast.Str): subscript = QN(StringLiteral(s.value.s)) else: - subscript = anno.getanno(node.slice.value, anno.Basic.QN) + # The index may be an expression, case in which a name doesn't make sense. 
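# --- Editor's note: illustrative sketch, not part of the patch. ---
# The default-aware getanno added in anno.py above removes the need for
# hasanno/getanno pairs like the one in this hunk; mirrors anno_test.py.
import ast
from tensorflow.contrib.autograph.pyct import anno

node = ast.Name()
anno.setanno(node, 'foo', 3)
assert anno.getanno(node, 'foo') == 3
assert anno.getanno(node, 'bar', default=7) == 7        # missing key -> default
anno.delanno(node, 'foo')
assert anno.getanno(node, 'foo', default=None) is None  # annotation removed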
+ if anno.hasanno(node.slice.value, anno.Basic.QN): + subscript = anno.getanno(node.slice.value, anno.Basic.QN) + else: + return node if anno.hasanno(node.value, anno.Basic.QN): anno.setanno(node, anno.Basic.QN, QN(anno.getanno(node.value, anno.Basic.QN), diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD index 83f3bafc421764..8064a967cd389e 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/BUILD +++ b/tensorflow/contrib/autograph/pyct/static_analysis/BUILD @@ -19,6 +19,7 @@ py_library( srcs = [ "activity.py", "annos.py", + "cfg.py", "live_values.py", "type_info.py", ], @@ -43,6 +44,19 @@ py_test( ], ) +py_test( + name = "cfg_test", + srcs = ["cfg_test.py"], + srcs_version = "PY2AND3", + tags = ["no_windows"], + deps = [ + ":static_analysis", + "//tensorflow/contrib/autograph/pyct", + "//tensorflow/python:client_testlib", + "@gast_archive//:gast", + ], +) + py_test( name = "live_values_test", srcs = ["live_values_test.py"], diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py index 2c14c2c8c23810..4d7b0cbb7b8f6e 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/activity.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity.py @@ -23,11 +23,12 @@ import gast from tensorflow.contrib.autograph.pyct import anno +from tensorflow.contrib.autograph.pyct import qual_names from tensorflow.contrib.autograph.pyct import transformer -from tensorflow.contrib.autograph.pyct.qual_names import QN from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno # TODO(mdan): Add support for PY3 (e.g. Param vs arg). +# TODO(alexbw): Ignore named literals (e.g. None) class Scope(object): @@ -43,16 +44,20 @@ class Scope(object): used: identifiers referenced in this scope """ - def __init__(self, parent, isolated=True): + def __init__(self, parent, isolated=True, add_unknown_symbols=False): """Create a new scope. Args: parent: A Scope or None. isolated: Whether the scope is isolated, that is, whether variables created in this scope should be visible to the parent scope. + add_unknown_symbols: Whether to handle attributed and subscripts + without having first seen the base name. + E.g., analyzing the statement 'x.y = z' without first having seen 'x'. """ self.isolated = isolated self.parent = parent + self.add_unknown_symbols = add_unknown_symbols self.modified = set() self.created = set() self.used = set() @@ -134,13 +139,17 @@ def mark_param(self, name): self.params.add(name) def mark_creation(self, name, writes_create_symbol=False): + """Mark a qualified name as created.""" if name.is_composite(): parent = name.parent - if self.has(parent): - if not writes_create_symbol: - return + if not writes_create_symbol: + return else: - raise ValueError('Unknown symbol "%s".' % parent) + if not self.has(parent): + if self.add_unknown_symbols: + self.mark_read(parent) + else: + raise ValueError('Unknown symbol "%s".' % parent) self.created.add(name) def mark_write(self, name): @@ -163,17 +172,25 @@ def mark_returned(self, name): class ActivityAnalyzer(transformer.Base): - """Annotates nodes with local scope information. See Scope.""" + """Annotates nodes with local scope information. - def __init__(self, context, parent_scope): + See Scope. + + The use of this class requires that qual_names.resolve() has been called on + the node. 
This class will ignore nodes have not been + annotated with their qualified names. + """ + + def __init__(self, context, parent_scope=None, add_unknown_symbols=False): super(ActivityAnalyzer, self).__init__(context) - self.scope = Scope(parent_scope) + self.scope = Scope(parent_scope, None, add_unknown_symbols) self._in_return_statement = False + self._in_aug_assign = False @property def _in_constructor(self): - innermost = self.enclosing_entities[-1] if len(self.enclosing_entities) > 1: + innermost = self.enclosing_entities[-1] parent = self.enclosing_entities[-2] return isinstance(parent, gast.ClassDef) and innermost.name == '__init__' return False @@ -184,6 +201,7 @@ def _node_sets_self_attribute(self, node): # TODO(mdan): The 'self' argument is not guaranteed to be called 'self'. if qn.has_attr and qn.parent.qn == ('self',): return True + return False def _track_symbol(self, node, @@ -201,12 +219,14 @@ def _track_symbol(self, self.scope.mark_write(qn.parent) if writes_create_symbol: self.scope.mark_creation(qn, writes_create_symbol=True) + if self._in_aug_assign: + self.scope.mark_read(qn) elif isinstance(node.ctx, gast.Load): self.scope.mark_read(qn) elif isinstance(node.ctx, gast.Param): # Param contexts appear in function defs, so they have the meaning of # defining a variable. - # TODO(mdan): This bay be incorrect with nested functions. + # TODO(mdan): This may be incorrect with nested functions. # For nested functions, we'll have to add the notion of hiding args from # the parent scope, not writing to them. self.scope.mark_creation(qn) @@ -222,6 +242,14 @@ def _track_symbol(self, if self._in_return_statement: self.scope.mark_returned(qn) + def visit_AugAssign(self, node): + # Special rules for AugAssign. In Assign, the target is only written, + # but in AugAssig (e.g. a += b), the target is both read and written. + self._in_aug_assign = True + self.generic_visit(node) + self._in_aug_assign = False + return node + def visit_Name(self, node): self.generic_visit(node) self._track_symbol(node) @@ -295,7 +323,7 @@ def _process_parallel_blocks(self, parent, children): def visit_FunctionDef(self, node): if self.scope: - qn = QN(node.name) + qn = qual_names.QN(node.name) self.scope.mark_write(qn) current_scope = self.scope body_scope = Scope(current_scope, isolated=True) @@ -355,5 +383,32 @@ def visit_Return(self, node): return node +def get_read(node, context): + """Return the variable names as QNs (qual_names.py) read by this statement.""" + analyzer = ActivityAnalyzer(context, None, True) + analyzer.visit(node) + return analyzer.scope.used + + +def get_updated(node, context): + """Return the variable names created or mutated by this statement. + + This function considers assign statements, augmented assign statements, and + the targets of for loops, as well as function arguments. + For example, `x[0] = 2` will return `x`, `x, y = 3, 4` will return `x` and + `y`, `for i in range(x)` will return `i`, etc. + Args: + node: An AST node + context: An EntityContext instance + + Returns: + A set of variable names (QNs, see qual_names.py) of all the variables + created or mutated. 
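# --- Editor's note: illustrative sketch, not part of the patch. ---
# Intended use of get_read/get_updated, modeled on activity_test.py below.
# The EntityContext arguments copy the test helpers and are an assumption.
from tensorflow.contrib.autograph.pyct import context
from tensorflow.contrib.autograph.pyct import parser
from tensorflow.contrib.autograph.pyct import qual_names
from tensorflow.contrib.autograph.pyct.static_analysis import activity

def test_fn(x, y):
  z = test_fn(x, y)
  return z

node, source = parser.parse_entity(test_fn)
ctx = context.EntityContext(
    namer=None, source_code=source, source_file=None, namespace={},
    arg_values=None, arg_types={}, owner_type=None, recursive=True)
node = qual_names.resolve(node)
stmt = node.body[0].body[0]  # the assignment `z = test_fn(x, y)`
assert activity.get_read(stmt, ctx) == set(map(qual_names.QN, ('test_fn', 'x', 'y')))
assert activity.get_updated(stmt, ctx) == {qual_names.QN('z')}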
+ """ + analyzer = ActivityAnalyzer(context, None, True) + analyzer.visit(node) + return analyzer.scope.created | analyzer.scope.modified + + def resolve(node, context, parent_scope=None): return ActivityAnalyzer(context, parent_scope).visit(node) diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py index ef79a295bfa394..fdbd349af9d332 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/activity_test.py @@ -123,7 +123,7 @@ def _parse_and_analyze(self, test_fn): recursive=True) node = qual_names.resolve(node) node = activity.resolve(node, ctx) - return node + return node, ctx def test_local_markers(self): @@ -133,7 +133,7 @@ def test_fn(a): # pylint:disable=unused-argument b -= 1 return b - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) self.assertFalse( anno.getanno(node.body[0].body[0].value, NodeAnno.IS_LOCAL)) # c in b = c @@ -156,6 +156,7 @@ def assertSymbolSetsAre(self, expected, actual, name): expected - actual, actual - expected)) def assertScopeIsRmc(self, scope, used, modified, created): + """Assert the scope contains specific used, modified & created variables.""" self.assertSymbolSetsAre(used, scope.used, 'read') self.assertSymbolSetsAre(modified, scope.modified, 'modified') self.assertSymbolSetsAre(created, scope.created, 'created') @@ -168,7 +169,7 @@ def test_fn(a): print(a, b) return c - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) print_node = node.body[0].body[2] if isinstance(print_node, gast.Print): # Python 2 @@ -191,7 +192,7 @@ def test_fn(a): foo(a, b) # pylint:disable=undefined-variable return c - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) call_node = node.body[0].body[2].value # We basically need to detect which variables are captured by the call # arguments. 
@@ -208,7 +209,7 @@ def test_fn(a): foo(a.b, a.c) return a.d - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) call_node = node.body[0].body[1].value self.assertScopeIsRmc( anno.getanno(call_node, NodeAnno.ARGS_SCOPE), @@ -234,7 +235,7 @@ def test_fn(a): foo(a[0], a[b]) return a[c] - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) call_node = node.body[0].body[2].value self.assertScopeIsRmc( anno.getanno(call_node, NodeAnno.ARGS_SCOPE), @@ -258,7 +259,7 @@ def test_fn(a): b -= 1 return b, c - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) while_node = node.body[0].body[1] self.assertScopeIsRmc( anno.getanno(while_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'), @@ -278,7 +279,7 @@ def test_fn(a): b -= 1 return b, c - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) for_node = node.body[0].body[1] self.assertScopeIsRmc( anno.getanno(for_node, NodeAnno.BODY_SCOPE), ('b',), ('b', 'c'), ('c',)) @@ -299,7 +300,7 @@ def test_fn(x): u = -y return z, u - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) if_node = node.body[0].body[0] self.assertScopeIsRmc( anno.getanno(if_node, NodeAnno.BODY_SCOPE), ('x', 'y'), ('x', 'y', 'z'), @@ -326,7 +327,7 @@ def test_fn(a): d = 1 return d - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) if_node = node.body[0].body[0] self.assertScopeIsRmc( anno.getanno(if_node, NodeAnno.BODY_SCOPE), @@ -358,7 +359,7 @@ def test_fn(a, b, c, e): d = 1 return d - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) if_node = node.body[0].body[0] self.assertScopeIsRmc( anno.getanno(if_node, NodeAnno.BODY_SCOPE), @@ -390,7 +391,7 @@ def test_fn(b): a = b * b return a - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) inner_if_node = node.body[0].body[0].body[0] self.assertScopeIsRmc( anno.getanno(inner_if_node, NodeAnno.BODY_SCOPE), ('b',), ('a',), @@ -413,7 +414,7 @@ def f(x): b -= f(i) return b, c - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) fn_def_node = node.body[0].body[0] self.assertScopeIsRmc( @@ -434,7 +435,7 @@ def __init__(self, a): self.b = a self.b.c = 1 - node = self._parse_and_analyze(TestClass) + node, _ = self._parse_and_analyze(TestClass) init_node = node.body[0].body[0] self.assertScopeIsRmc( anno.getanno(init_node, NodeAnno.BODY_SCOPE), @@ -448,15 +449,118 @@ def test_aug_assign_subscripts(self): def test_fn(a): a[0] += 1 - node = self._parse_and_analyze(test_fn) + node, _ = self._parse_and_analyze(test_fn) fn_node = node.body[0] self.assertScopeIsRmc( anno.getanno(fn_node, NodeAnno.BODY_SCOPE), - ('a',), + ('a', 'a[0]'), ('a', 'a[0]'), ('a',), ) + def test_return_vars_are_read(self): + + def test_fn(a, b, c): # pylint: disable=unused-argument + return c + + node, _ = self._parse_and_analyze(test_fn) + fn_node = node.body[0] + self.assertScopeIsRmc( + anno.getanno(fn_node, NodeAnno.BODY_SCOPE), + ('c',), + (), + ( + 'a', + 'b', + 'c', + ), + ) + + def test_aug_assign(self): + + def test_fn(a, b): + a += b + + node, _ = self._parse_and_analyze(test_fn) + fn_node = node.body[0] + self.assertScopeIsRmc( + anno.getanno(fn_node, NodeAnno.BODY_SCOPE), + ('a', 'b'), + ('a'), + ('a', 'b'), + ) + + def test_aug_assign_rvalues(self): + + a = dict(bar=3) + + def foo(): + return a + + def test_fn(x): + foo()['bar'] += x + + node, _ = 
self._parse_and_analyze(test_fn) + fn_node = node.body[0] + self.assertScopeIsRmc( + anno.getanno(fn_node, NodeAnno.BODY_SCOPE), + ('foo', 'x'), + (), + ('x',), + ) + + def test_params_created(self): + + def test_fn(a, b): # pylint: disable=unused-argument + return b + + node, _ = self._parse_and_analyze(test_fn) + fn_node = node.body[0] + self.assertScopeIsRmc( + anno.getanno(fn_node, NodeAnno.BODY_SCOPE), ('b',), (('')), + (('a', 'b'))) + + def test_get_read(self): + + def test_fn(x, y): + z = test_fn(x, y) + return z + + node, ctx = self._parse_and_analyze(test_fn) + node = node.body[0].body[0] + read_vars = activity.get_read(node, ctx) + self.assertEqual(read_vars, set(map(qual_names.QN, ('test_fn', 'x', 'y')))) + + def test_fn2(x, y, z): + z += test_fn2(x, y, z) + return z + + node, ctx = self._parse_and_analyze(test_fn2) + node = node.body[0].body[0] + read_vars = activity.get_read(node, ctx) + self.assertEqual(read_vars, + set(map(qual_names.QN, ('test_fn2', 'x', 'y', 'z')))) + + def test_get_updated(self): + + def test_fn(x, y): + z = test_fn(x, y) + return z + + node, ctx = self._parse_and_analyze(test_fn) + node = node.body[0].body[0] + updated_vars = activity.get_updated(node, ctx) + self.assertEqual(updated_vars, set(map(qual_names.QN, ('z')))) + + def test_fn2(x, y, z): + z += test_fn2(x, y, z) + return z + + node, ctx = self._parse_and_analyze(test_fn2) + node = node.body[0].body[0] + updated_vars = activity.get_updated(node, ctx) + self.assertEqual(updated_vars, set(map(qual_names.QN, ('z')))) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py new file mode 100644 index 00000000000000..ad97fdfa8e78d1 --- /dev/null +++ b/tensorflow/contrib/autograph/pyct/static_analysis/cfg.py @@ -0,0 +1,445 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Control flow graph analysis. + +Given a Python AST we construct a control flow graph, with edges both to the +next and previous statements (so it can easily walk the graph both ways). Its +nodes contain the AST of the statements. It can then perform forward or backward +analysis on this CFG. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import namedtuple +import functools +import operator + +import gast + +from tensorflow.contrib.autograph.pyct import anno +from tensorflow.contrib.autograph.pyct.static_analysis import activity + + +class CfgNode(object): + """A node in the CFG.""" + __slots__ = ['next', 'value', 'prev'] + + def __init__(self, value): + self.next = set() + self.prev = set() + self.value = value + + +class Cfg(namedtuple('Cfg', ['entry', 'exit'])): + """A Control Flow Graph. + + Each statement is represented as a node. 
For control flow statements such + as conditionals and loops the conditional itself is a node which either + branches or cycles, respectively. + Attributes: + entry: The entry node, which contains the `gast.arguments` node of the + function definition. + exit: The exit node. This node is special because it has no value (i.e. no + corresponding AST node). This is because Python functions can have + multiple return statements. + """ + pass + + +class CfgBuilder(gast.NodeVisitor): + """Construct a control flow graph. + + Construct a CFG starting from a FunctionDef node. + Usage: + cfg_obj = CfgBuilder().build_cfg(fndef_node) + """ + + def __init__(self): + # The current leaves of the CFG + self.current_leaves = [] + # TODO(alexbw): generalize to break, return, continue, yield, etc. + # A stack of lists, tracking continue statements + self.continue_ = [] + # A stack of lists tracking break nodes + self.break_ = [] + + def set_current_leaves(self, cfg_node): + """Link this cfg_node to the current leaves. + + This is the central function for building the CFG. It links the current + head cfg_nodes to the passed cfg_node. It then resets the head to the + passed cfg_node. + + Args: + cfg_node: A CfgNode instance. + """ + for head in self.current_leaves: + head.next.add(cfg_node) + # While we're linking the CFG forward, add backlinks + cfg_node.prev.add(head) + self.current_leaves = [cfg_node] + + def build_cfg(self, node): + """Build a CFG for a function. + + Implementation of building a CFG for dataflow analysis. See, e.g.: + https://www.seas.harvard.edu/courses/cs252/2011sp/slides/Lec02-Dataflow.pdf + + Args: + node: A function definition the body of which to analyze. + Returns: + A CFG object. + Raises: + TypeError: If the input is not a function definition. 
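# --- Editor's note: illustrative sketch, not part of the patch. ---
# Building a CFG for a small function, as described in the docstring above.
import gast
from tensorflow.contrib.autograph.pyct.static_analysis import cfg

source = (
    'def f(x):\n'
    '  if x > 0:\n'
    '    x = x - 1\n'
    '  return x\n')
fn_node = gast.parse(source).body[0]       # the gast.FunctionDef node
graph = cfg.CfgBuilder().build_cfg(fn_node)
entry = graph.entry                        # wraps the gast.arguments node
# graph.exit has value None; every path through the body links into it.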
+ """ + if not isinstance(node, gast.FunctionDef): + raise TypeError('input must be a function definition') + entry_cfg_node = CfgNode(node.args) + self.current_leaves = [entry_cfg_node] + self.visit_statements(node.body) + exit_cfg_node = CfgNode(None) + self.set_current_leaves(exit_cfg_node) + return Cfg(entry_cfg_node, exit_cfg_node) + + def visit_statements(self, nodes): + for node in nodes: + # Check for control flow + if isinstance(node, (gast.For, gast.While, gast.If, gast.Try, gast.Break, + gast.Continue, gast.With)): + self.visit(node) + else: + expr = CfgNode(node) + self.set_current_leaves(expr) + + def generic_visit(self, node): + raise ValueError('unknown control flow') + + def visit_If(self, node): + # TODO(alexbw): change this to use immutable tuples instead of lists + # The current head will hold the conditional + test = CfgNode(node.test) + self.set_current_leaves(test) + # Handle the body + self.visit_statements(node.body) + body_exit = self.current_leaves + self.current_leaves = [test] + # Handle the orelse + self.visit_statements(node.orelse) + self.current_leaves.extend(body_exit) + + def visit_While(self, node): + test = CfgNode(node.test) + self.set_current_leaves(test) + # Start a new level of nesting + self.break_.append([]) + self.continue_.append([]) + # Handle the body + self.visit_statements(node.body) + body_exit = self.current_leaves + self.current_leaves.extend(self.continue_.pop()) + self.set_current_leaves(test) + # Handle the orelse + self.visit_statements(node.orelse) + # The break statements and the test go to the next node + self.current_leaves.extend(self.break_.pop()) + # Body and orelse statements can reach out of the loop + self.current_leaves.extend(body_exit) + + def visit_For(self, node): + iter_ = CfgNode(node.iter) + self.set_current_leaves(iter_) + self.break_.append([]) + self.continue_.append([]) + self.visit_statements(node.body) + body_exit = self.current_leaves + self.current_leaves.extend(self.continue_.pop()) + self.set_current_leaves(iter_) + # Handle the orelse + self.visit_statements(node.orelse) + # The break statements and the test go to the next node + self.current_leaves.extend(self.break_.pop()) + # Body and orelse statements can reach out of the loop + self.current_leaves.extend(body_exit) + + def visit_Break(self, node): + self.break_[-1].extend(self.current_leaves) + self.current_leaves[:] = [] + + def visit_Continue(self, node): + self.continue_[-1].extend(self.current_leaves) + self.current_leaves[:] = [] + + def visit_Try(self, node): + self.visit_statements(node.body) + body = self.current_leaves + handlers = [] + for handler in node.handlers: + self.current_leaves = body[:] + self.visit_statements(handler.body) + handlers.extend(self.current_leaves) + self.current_leaves = body + self.visit_statements(node.orelse) + self.current_leaves = handlers + self.current_leaves + self.visit_statements(node.finalbody) + + def visit_With(self, node): + for item in node.items: + self.set_current_leaves(CfgNode(item)) + self.visit_statements(node.body) + + +# TODO(alexbw): once CFG analysis occurs at a block level, +# this extra class will not be necessary +class PropagateAnalysis(gast.NodeVisitor): + """Port analysis annotations from statements to their enclosing blocks.""" + + def __init__(self, analysis): + self.transfer_fn = analysis.transfer_fn + self.in_label = analysis.in_label + self.out_label = analysis.out_label + super(PropagateAnalysis, self).__init__() + + def visit_If(self, node): + # Depth-first. 
+ self.generic_visit(node) + incoming = anno.getanno(node.body[0], self.in_label) + incoming |= anno.getanno(node.test, self.in_label) + outgoing = anno.getanno(node.body[-1], self.out_label) + outgoing |= anno.getanno(node.test, self.out_label) + if node.orelse: + orelse_outgoing = anno.getanno(node.orelse[-1], self.out_label) + outgoing = self.transfer_fn(outgoing, orelse_outgoing) + anno.setanno(node, self.in_label, incoming) + anno.setanno(node, self.out_label, outgoing) + + def visit_For(self, node): + self.generic_visit(node) + incoming = set(anno.getanno(node.body[0], self.in_label)) + incoming -= set((anno.getanno(node.target, anno.Basic.QN),)) + outgoing = anno.getanno(node.body[-1], self.out_label) + if node.orelse: + orelse_outgoing = anno.getanno(node.orelse[-1], self.out_label) + outgoing = self.transfer_fn(outgoing, orelse_outgoing) + anno.setanno(node, self.in_label, frozenset(incoming)) + anno.setanno(node, self.out_label, outgoing) + + def visit_While(self, node): + self.generic_visit(node) + incoming = anno.getanno(node.body[0], self.in_label) + incoming |= anno.getanno(node.test, self.in_label) + outgoing = anno.getanno(node.body[-1], self.out_label) + if node.orelse: + orelse_outgoing = anno.getanno(node.orelse[-1], self.out_label) + outgoing = self.transfer_fn(outgoing, orelse_outgoing) + anno.setanno(node, self.in_label, incoming) + anno.setanno(node, self.out_label, outgoing) + + def visit_With(self, node): + self.generic_visit(node) + incoming = anno.getanno(node.body[0], self.in_label) + for item in node.items: + incoming |= anno.getanno(item, self.in_label) + outgoing = anno.getanno(node.body[-1], self.out_label) + anno.setanno(node, self.in_label, incoming) + anno.setanno(node, self.out_label, outgoing) + + +# TODO(alexbw): Abstract the CFG walking machinery into a superclass +# which is parameterized on which fields it selects when walking. +# TODO(alexbw): Abstract the application of dataflow analysis +class Forward(object): + """Forward analysis on CFG. + + Args: + label: A name for this analysis e.g. 'active' for activity analysis. The AST + nodes in the CFG will be given annotations 'name_in', 'name_out', + 'name_gen' and 'name_kill' which contain the incoming values, outgoing + values, values generated by the statement, and values deleted by the + statement respectively. + transfer_fn: Either the AND or OR operator. If the AND operator is used it + turns into forward must analysis (i.e. a value will only be carried + forward if it appears on all incoming paths). The OR operator means that + forward may analysis is done (i.e. the union of incoming values will be + taken). + """ + + def __init__(self, label, context, transfer_fn=operator.or_): + self.transfer_fn = transfer_fn + self.context = context + self.out_label = label + '_out' + self.in_label = label + '_in' + self.gen_label = label + '_gen' + self.kill_label = label + '_kill' + + # TODO(alexbw): see if we can simplify by visiting breadth-first + def visit(self, node): + """Depth-first walking the CFG, applying dataflow information propagtion.""" + # node.value is None only for the exit CfgNode. 
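# --- Editor's note: illustrative sketch, not part of the patch. ---
# What the transfer_fn choice described above means when merging the facts
# arriving from multiple predecessor branches.
import functools
import operator

branch_facts = [frozenset({'x', 'y'}), frozenset({'x', 'z'})]
may = functools.reduce(operator.or_, branch_facts)    # {'x', 'y', 'z'}: defined on some path
must = functools.reduce(operator.and_, branch_facts)  # {'x'}: defined on every path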
+ if not node.value: + return + + if anno.hasanno(node.value, self.out_label): + before = hash(anno.getanno(node.value, self.out_label)) + else: + before = None + preds = [ + anno.getanno(pred.value, self.out_label) + for pred in node.prev + if anno.hasanno(pred.value, self.out_label) + ] + if preds: + incoming = functools.reduce(self.transfer_fn, preds[1:], preds[0]) + else: + incoming = frozenset() + anno.setanno(node.value, self.in_label, incoming) + gen, kill = self.get_gen_kill(node, incoming) + anno.setanno(node.value, self.gen_label, gen) + anno.setanno(node.value, self.kill_label, kill) + anno.setanno(node.value, self.out_label, (incoming - kill) | gen) + + if hash(anno.getanno(node.value, self.out_label)) != before: + for succ in node.next: + self.visit(succ) + + def get_gen_kill(self, cfg_node, incoming): + """Calculate Gen and Kill properties of a CFG node in dataflow analysis. + + A function which takes the CFG node as well as a set of incoming + values. It must return a set of newly generated values by the statement as + well as a set of deleted (killed) values. + + Args: + cfg_node: A CfgNode instance. + incoming: + """ + raise NotImplementedError() + + +class Backward(Forward): + """Backward analysis on CFG.""" + + def visit(self, cfg_node): + # cfg_node.value is None for the exit node, which will be visited only once + if not cfg_node.value: + for pred in cfg_node.prev: + self.visit(pred) + return + + if anno.hasanno(cfg_node.value, self.in_label): + before = hash(anno.getanno(cfg_node.value, self.in_label)) + else: + before = None + succs = [ + anno.getanno(succ.value, self.in_label) + for succ in cfg_node.next + if anno.hasanno(succ.value, self.in_label) + ] + if succs: + incoming = functools.reduce(self.transfer_fn, succs[1:], succs[0]) + else: + incoming = frozenset() + anno.setanno(cfg_node.value, self.out_label, incoming) + gen, kill = self.get_gen_kill(cfg_node, incoming) + anno.setanno(cfg_node.value, self.gen_label, gen) + anno.setanno(cfg_node.value, self.kill_label, kill) + anno.setanno(cfg_node.value, self.in_label, (incoming - kill) | gen) + if hash(anno.getanno(cfg_node.value, self.in_label)) != before: + for pred in cfg_node.prev: + self.visit(pred) + + +def run_analyses(node, analyses): + """Perform dataflow analysis on all functions within an AST. + + Args: + node: An AST node on which to run dataflow analysis. + analyses: Either an instance of the Forward or Backward dataflow analysis + class, or a list or tuple of them. + + Returns: + node: The node, but now with annotations on the AST nodes containing the + results of the dataflow analyses. + """ + if not isinstance(analyses, (tuple, list)): + analyses = (analyses,) + for analysis in analyses: + if not isinstance(analysis, (Forward, Backward)): + raise TypeError('not a valid forward analysis object') + + for child_node in gast.walk(node): + if isinstance(child_node, gast.FunctionDef): + cfg_obj = CfgBuilder().build_cfg(child_node) + for analysis in analyses: + if isinstance(analysis, Backward): + analysis.visit(cfg_obj.exit) + elif isinstance(analysis, Forward): + analysis.visit(cfg_obj.entry) + for analysis in analyses: + PropagateAnalysis(analysis).visit(node) + return node + + +class Liveness(Backward): + """Perform a liveness analysis. + + Each statement is annotated with a set of variables that may be used + later in the program. 
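# --- Editor's note: illustrative sketch, not part of the patch. ---
# Running liveness end to end, modeled on cfg_test.py below; the
# EntityContext construction copies the test helper and is an assumption.
from tensorflow.contrib.autograph.pyct import anno
from tensorflow.contrib.autograph.pyct import context
from tensorflow.contrib.autograph.pyct import parser
from tensorflow.contrib.autograph.pyct import qual_names
from tensorflow.contrib.autograph.pyct.static_analysis import cfg

def f(x):
  a = x + 1
  return a

node, source = parser.parse_entity(f)
ctx = context.EntityContext(
    namer=None, source_code=source, source_file=None, namespace={},
    arg_values=None, arg_types={}, owner_type=None, recursive=True)
node = qual_names.resolve(node)
cfg.run_analyses(node, cfg.Liveness(ctx))
body = node.body[0].body
# 'a' is live going into the return statement that uses it.
assert qual_names.QN('a') in anno.getanno(body[1], 'live_in')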
+ """ + + def __init__(self, context): + super(Liveness, self).__init__('live', context) + + def get_gen_kill(self, node, _): + # A variable's parents are live if it is live + # e.g. x is live if x.y is live. This means gen needs to return + # all parents of a variable (if it's an Attribute or Subscript). + # This doesn't apply to kill (e.g. del x.y doesn't affect liveness of x) + gen = activity.get_read(node.value, self.context) + gen = functools.reduce(lambda left, right: left | right.support_set, gen, + gen) + kill = activity.get_updated(node.value, self.context) + return gen, kill + + +class ReachingDefinitions(Forward): + """Perform reaching definition analysis. + + Each statement is annotated with a set of (variable, definition) pairs. + """ + + def __init__(self, context): + super(ReachingDefinitions, self).__init__('definitions', context) + + def get_gen_kill(self, node, incoming): + definitions = activity.get_updated(node.value, self.context) + gen = frozenset((id_, node.value) for id_ in definitions) + kill = frozenset(def_ for def_ in incoming if def_[0] in definitions) + return gen, kill + + +class Defined(Forward): + """Perform defined variable analysis. + + Each statement is annotated with a set of variables which are guaranteed to + be defined at that point. + """ + + def __init__(self, context): + super(Defined, self).__init__('defined', context, transfer_fn=operator.and_) + + def get_gen_kill(self, node, _): + gen = activity.get_updated(node.value, self.context) + return gen, frozenset() diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/cfg_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/cfg_test.py new file mode 100644 index 00000000000000..fc07fa3447b23c --- /dev/null +++ b/tensorflow/contrib/autograph/pyct/static_analysis/cfg_test.py @@ -0,0 +1,306 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for cfg module.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import functools + +import gast + +from tensorflow.contrib.autograph.pyct import anno +from tensorflow.contrib.autograph.pyct import context +from tensorflow.contrib.autograph.pyct import parser +from tensorflow.contrib.autograph.pyct import qual_names +from tensorflow.contrib.autograph.pyct.static_analysis import cfg +from tensorflow.python.platform import test + + +class CFGTest(test.TestCase): + + def _parse_and_analyze(self, test_fn, namespace, arg_types=None): + arg_types = arg_types or {} + node, source = parser.parse_entity(test_fn) + ctx = context.EntityContext( + namer=None, + source_code=source, + source_file=None, + namespace=namespace, + arg_values=None, + arg_types=arg_types, + owner_type=None, + recursive=True) + node = qual_names.resolve(node) + return node, ctx + + def _check_anno_matches(self, node, anno_name, var_names): + if isinstance(var_names, str): + var_names = (var_names,) + qual_vars = set() + for var_name in var_names: + if isinstance(var_name, str): + if '[' in var_name or ']' in var_name: + raise ValueError('Annotation matching not supported with subscript.') + if '.' not in var_name: + qual_vars.add(qual_names.QN(var_name)) + else: + attrs = var_name.split('.') + this_qn = functools.reduce(qual_names.QN, attrs[1:], + qual_names.QN(attrs[0])) + qual_vars.add(this_qn) + self.assertEqual(anno.getanno(node, anno_name), qual_vars) + + def test_reaching(self): + + def f(x): + print(x) + while True: + x = x + x = x + return x + + node, ctx = self._parse_and_analyze(f, {}) + cfg.run_analyses(node, cfg.ReachingDefinitions(ctx)) + body = node.body[0].body + # Only the argument reaches the expression + def_in = anno.getanno(body[0], 'definitions_in') + # One element, x, from arguments + self.assertEqual(set(type(d[1]) for d in def_in), set((gast.arguments,))) + + while_body = body[1].body + def_in = anno.getanno(while_body[0], 'definitions_in') + # One definition, two possible sources. 
+ # - One from an assignment (if the loop is entered) + # - The other from the arguments (if loop is not entered) + self.assertEqual( + set(type(d[1]) for d in def_in), set((gast.arguments, gast.Assign))) + + def_in = anno.getanno(while_body[1], 'definitions_in') + # If we've reached this line, the only reaching definition of x is the + # Assign node in previous line + self.assertEqual(set(type(d[1]) for d in def_in), set((gast.Assign,))) + + def_in = anno.getanno(body[2], 'definitions_in') + # Same situation as while_body[0] + self.assertEqual( + set(type(d[1]) for d in def_in), set((gast.arguments, gast.Assign))) + + def test_defined(self): + + def f(x): + if x: + y = 2 # pylint: disable=unused-variable + return x + + node, ctx = self._parse_and_analyze(f, {}) + cfg.run_analyses(node, cfg.Defined(ctx)) + body = node.body[0].body + # only x is for sure defined at the end + self._check_anno_matches(body[1], 'defined_in', 'x') + # at the end of the if body both x and y are defined + if_body = body[0].body + self._check_anno_matches(if_body[0], 'defined_out', ('x', 'y')) + + def _get_live_annotated_fnbody(self, f): + node, ctx = self._parse_and_analyze(f, {}) + cfg.run_analyses(node, cfg.Liveness(ctx)) + body = node.body[0].body + return body + + def test_live_straightline(self): + + def f1(x): + a = g(x) # pylint: disable=undefined-variable + b = h(a) # pylint: disable=undefined-variable, unused-variable + return x + + body = self._get_live_annotated_fnbody(f1) + self._check_anno_matches(body[1], 'live_in', ('a', 'h', 'x')) + self._check_anno_matches(body[2], 'live_in', ('x')) + self._check_anno_matches(body[0], 'live_in', ('g', 'h', 'x')) + self._check_anno_matches(body[2], 'live_out', ()) + + def test_live_stacked_conds_with_else(self): + + def f2(x, a): # pylint: disable=unused-argument + if a > 0: # x should not be live + x = 0 + if a > 1: + x = 1 + else: + x = 2 + + body = self._get_live_annotated_fnbody(f2) + self._check_anno_matches(body[0], 'live_in', ('a')) + self._check_anno_matches(body[1], 'live_in', ('a')) + + def test_live_stacked_conds(self): + + def f3(x, a): + if a > 0: # x and a should be live + x = 0 + if a > 1: # x and a should be live_in + x = 1 + return x # x should be live + + body = self._get_live_annotated_fnbody(f3) + self._check_anno_matches(body[0], 'live_in', ('a', 'x')) + self._check_anno_matches(body[1], 'live_in', ('a', 'x')) + self._check_anno_matches(body[2], 'live_in', ('x')) + + def test_live_possibly_unused_cond(self): + + def f4(x, a): + if a > 0: # x should be live + x = 0 + x += 1 + + body = self._get_live_annotated_fnbody(f4) + self._check_anno_matches(body[0], 'live_in', ('x', 'a')) + self._check_anno_matches(body[1], 'live_in', ('x')) + + def test_live_attribute_in_cond(self): + + def f5(x, a): + if a > 0: # x.y should be live + x.y = 0 + return x.y + + body = self._get_live_annotated_fnbody(f5) + self._check_anno_matches(body[0], 'live_in', ('x', 'x.y', 'a')) + + def test_live_noop(self): + + def f6(x): + return x # should this cause x.* to be live? 
+ + body = self._get_live_annotated_fnbody(f6) + self._check_anno_matches(body[0], 'live_in', ('x')) + + def test_live_loop(self): + + def f7(x, n): + for i in range(n): + x += i + return x + + body = self._get_live_annotated_fnbody(f7) + self._check_anno_matches(body[0], 'live_in', ('x', 'n', 'range')) + self._check_anno_matches(body[1], 'live_in', ('x')) + + def test_live_context_manager(self): + + def f8(x, f): + with f: + x += 1 + + body = self._get_live_annotated_fnbody(f8) + self._check_anno_matches(body[0], 'live_in', ('f', 'x')) + + def test_node_equality(self): + node_a = gast.parse('y = x').body[0] + node_b = gast.parse('y = x').body[0] + self.assertNotEqual(node_a, node_b) + + def test_nested_functions_defined(self): + + def f(x): + y = x * 2 + + def g(z): + return z + y + + return g(x) + + node, ctx = self._parse_and_analyze(f, {}) + cfg.run_analyses(node, cfg.Defined(ctx)) + + body = node.body[0].body + self.assertEqual( + anno.getanno(body[2], 'defined_in'), + frozenset(map(qual_names.QN, ('g', 'x', 'y')))) + + # TODO(alexbw): CFG analysis doesn't currently cross FunctionDef boundaries. + # NOTE: 'z' is easy to find, but 'y' is not identified as + # defined, because CFG analysis is applied with each function separately. + # fndef_body = body[1].body + # self.assertEqual( + # anno.getanno(fndef_body[0], 'defined_in'), + # frozenset(map(qual_names.QN, ('z', 'y')))) + + def test_nested_functions_dont_leak_definitions(self): + + def f(x): + print(x) + + def g(): + y = 2 + return y + + return g() # y is not defined here + + node, ctx = self._parse_and_analyze(f, {}) + cfg.run_analyses(node, cfg.Defined(ctx)) + body = node.body[0].body + self.assertEqual( + anno.getanno(body[2], 'defined_in'), + frozenset(map(qual_names.QN, ('x', 'g')))) + + def test_loop_else(self): + + # Disabling useless-else-on-loop error, because 'break' and 'continue' + # canonicalization are a separate analysis pass, and here we test + # the CFG analysis in isolation. + def for_orelse(x): + y = 0 + for i in range(len(x)): + x += i + else: # pylint: disable=useless-else-on-loop + y = 1 + return x, y + + def while_orelse(x, i): + y = 0 + while x < 10: + x += i + else: # pylint: disable=useless-else-on-loop + y = 1 + return x, y + + for f in (for_orelse, while_orelse): + node, ctx = self._parse_and_analyze(f, {}) + cfg.run_analyses(node, cfg.ReachingDefinitions(ctx)) + body = node.body[0].body + return_node = body[-1] + reaching_defs = anno.getanno(return_node, 'definitions_in') + + # Y could be defined by Assign(Num(0)) or Assign(Num(1)) + # X could be defined as an argument or an AugAssign. 
+ y_defs = [node for var, node in reaching_defs if str(var) == 'y'] + x_defs = [node for var, node in reaching_defs if str(var) == 'x'] + + self.assertEqual(set((gast.Assign,)), set(type(def_) for def_ in y_defs)) + self.assertEqual(set((0, 1)), set(def_.value.n for def_ in y_defs)) + self.assertEqual(len(y_defs), 2) + self.assertEqual( + set((gast.arguments, gast.AugAssign)), + set(type(def_) for def_ in x_defs)) + self.assertEqual(len(x_defs), 2) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py index c00946f9c41bc6..d6555dc7e0b3d4 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py @@ -136,14 +136,14 @@ def visit_If(self, node): def _process_function_arg(self, arg_name): str_name = str(arg_name) + type_holder = arg_name.ast() + self.scope.setval(arg_name, type_holder) if len(self.enclosing_entities) == 1 and str_name in self.context.arg_types: # Forge a node to hold the type information, so that method calls on # it can resolve the type. - type_holder = arg_name.ast() type_string, type_obj = self.context.arg_types[str_name] anno.setanno(type_holder, 'type', type_obj) anno.setanno(type_holder, 'type_fqn', tuple(type_string.split('.'))) - self.scope.setval(arg_name, type_holder) def visit_arg(self, node): self._process_function_arg(anno.getanno(node.arg, anno.Basic.QN)) @@ -167,50 +167,41 @@ def visit_Name(self, node): anno.getanno(definition, 'element_type')) return node - def _process_variable_assignment(self, source, targets): - # Special case: constructors. - if isinstance(source, gast.Call): - func = source.func + def _process_variable_assignment(self, target, value): + # Constructors + if isinstance(value, gast.Call): + func = value.func if anno.hasanno(func, 'live_val'): func_obj = anno.getanno(func, 'live_val') if tf_inspect.isclass(func_obj): - anno.setanno(source, 'is_constructor', True) - anno.setanno(source, 'type', func_obj) - anno.setanno(source, 'type_fqn', anno.getanno(func, 'fqn')) + anno.setanno(value, 'is_constructor', True) + anno.setanno(value, 'type', func_obj) + anno.setanno(value, 'type_fqn', anno.getanno(func, 'fqn')) # TODO(mdan): Raise an error if constructor has side effects. # We can have a whitelist of no-side-effects constructors. # We can also step inside the constructor and further analyze. - # Multiple targets mean multiple assignment. - for target in targets: - # Tuple target means unpacking. - if isinstance(target, (gast.Tuple, gast.List)): - for i, target_item in enumerate(target.elts): - # Two cases here: - # 1. Static unpacking, e.g. a, b = c, d - # 2. Dynamic unpacking, e.g. a, b = c - # The former case is optimized away. 
- if isinstance(source, (gast.Tuple, gast.List)): - source_item = source.elts[i] - else: - source_item = gast.Subscript(source, gast.Index(i), ctx=None) - self._process_variable_assignment(source_item, (target_item,)) - elif isinstance(target, (gast.Name, gast.Attribute)): - target_symbol = anno.getanno(target, anno.Basic.QN) - self.scope.setval(target_symbol, source) - else: - raise ValueError('assignment target has unknown type: %s' % target) + if isinstance(target, (gast.Name, gast.Attribute)): + target_symbol = anno.getanno(target, anno.Basic.QN) + self.scope.setval(target_symbol, value) + elif isinstance(target, gast.Subscript): + pass + else: + raise ValueError('assignment target has unknown type: %s' % target) def visit_With(self, node): - for wi in node.items: - if wi.optional_vars is not None: - self._process_variable_assignment(wi.context_expr, (wi.optional_vars,)) + for item in node.items: + if item.optional_vars is not None: + self.apply_to_single_assignments((item.optional_vars,), + item.context_expr, + self._process_variable_assignment) self.generic_visit(node) return node def visit_Assign(self, node): self.generic_visit(node) - self._process_variable_assignment(node.value, node.targets) + self.apply_to_single_assignments( + node.targets, node.value, self._process_variable_assignment) return node def visit_Call(self, node): diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py index 46b7701624a430..95cbf5ca79a504 100644 --- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py +++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py @@ -196,6 +196,19 @@ def test_fn(): f_ref = node.body[0].body[1].value self.assertEqual(anno.getanno(f_ref, 'element_type'), Foo) + def test_type_annotation_args(self): + + class Foo(object): + pass + + def test_fn(f): + utils.set_element_type(f, Foo) + return f + + node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'utils': utils}) + f_ref = node.body[0].body[1].value + self.assertEqual(anno.getanno(f_ref, 'element_type'), Foo) + def test_nested_unpacking(self): class Foo(object): diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py index 4db6cc0adfad90..60bca8b38dcf62 100644 --- a/tensorflow/contrib/autograph/pyct/transformer.py +++ b/tensorflow/contrib/autograph/pyct/transformer.py @@ -70,14 +70,40 @@ def enclosing_entities(self): return tuple(self._enclosing_entities) @property - def locel_scope_level(self): + def local_scope_level(self): return len(self._local_scope_state) - def enter_local_scope(self): - self._local_scope_state.append({}) + def enter_local_scope(self, inherit=None): + """Marks entry into a new local scope. - def exit_local_scope(self): - return self._local_scope_state.pop() + Args: + inherit: Optional enumerable of variable names to copy from the + parent scope. + """ + scope_entered = {} + if inherit: + this_scope = self._local_scope_state[-1] + for name in inherit: + if name in this_scope: + scope_entered[name] = this_scope[name] + self._local_scope_state.append(scope_entered) + + def exit_local_scope(self, keep=None): + """Marks exit from the current local scope. + + Args: + keep: Optional enumerable of variable names to copy into the + parent scope. + Returns: + A dict containing the scope that has just been exited. 
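# --- Editor's note: illustrative sketch, not part of the patch. ---
# How a transformer.Base subclass might use the new inherit/keep hooks. The
# class and the 'fn_name'/'found_return' keys are hypothetical.
from tensorflow.contrib.autograph.pyct import transformer

class ScopeDemoTransformer(transformer.Base):

  def visit_FunctionDef(self, node):
    self.enter_local_scope()
    self.set_local('fn_name', node.name)
    node = self.generic_visit(node)
    scope = self.exit_local_scope()
    # 'found_return' shows up in `scope` if a nested visit_If kept it (below).
    return node

  def visit_If(self, node):
    # The child scope starts seeded with 'fn_name' from the parent scope.
    self.enter_local_scope(inherit=('fn_name',))
    node = self.generic_visit(node)
    # Copy 'found_return' (set while visiting the body) to the parent scope.
    self.exit_local_scope(keep=('found_return',))
    return node

  def visit_Return(self, node):
    self.set_local('found_return', True)
    return node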
+ """ + scope_left = self._local_scope_state.pop() + if keep: + this_scope = self._local_scope_state[-1] + for name in keep: + if name in scope_left: + this_scope[name] = scope_left[name] + return scope_left def set_local(self, name, value): self._local_scope_state[-1][name] = value @@ -91,38 +117,163 @@ def debug_print(self, node): print(pretty_printer.fmt(node)) return node - def visit_block(self, nodes): - """Helper equivalent to generic_visit, but for node lists.""" + def visit_block(self, nodes, before_visit=None, after_visit=None): + """A more powerful version of generic_visit for statement blocks. + + An example of a block is the body of an if statement. + + This function allows specifying a postprocessing callback (the + after_visit argument) argument which can be used to move nodes to a new + destination. This is done by after_visit by returning a non-null + second return value, e.g. return new_node, new_destination. + + For example, a transformer could perform the following move: + + foo() + bar() + baz() + + foo() + if cond: + bar() + baz() + + The above could be done with a postprocessor of this kind: + + def after_visit(node): + if node_is_function_call(bar): + new_container_node = build_cond() + new_container_node.body.append(node) + return new_container_node, new_container_node.body + else: + # Once we set a new destination, all subsequent items will be + # moved to it, so we don't need to explicitly handle baz. + return node, None + + Args: + nodes: enumerable of AST node objects + before_visit: optional callable that is called before visiting each item + in nodes + after_visit: optional callable that takes in an AST node and + returns a tuple (new_node, new_destination). It is called after + visiting each item in nodes. Is used in the same was as the + visit_* methods: new_node will replace the node; if not None, + new_destination must be a list, and subsequent nodes will be placed + in this list instead of the list returned by visit_block. + Returns: + A list of AST node objects containing the transformed items fron nodes, + except those nodes that have been relocated using after_visit. + """ results = [] + node_destination = results for node in nodes: + if before_visit: + # TODO(mdan): We can modify node here too, if ever needed. + before_visit() + replacement = self.visit(node) + + if after_visit and replacement: + replacement, new_destination = after_visit(replacement) + else: + new_destination = None + if replacement: if isinstance(replacement, (list, tuple)): - results.extend(replacement) + node_destination.extend(replacement) else: - results.append(replacement) + node_destination.append(replacement) + + # Allow the postprocessor to reroute the remaining nodes to a new list. + if new_destination is not None: + node_destination = new_destination return results + # TODO(mdan): Once we have error tracing, we may be able to just go to SSA. + def apply_to_single_assignments(self, targets, values, apply_fn): + """Applies a fuction to each individual assignment. + + This function can process a possibly-unpacked (e.g. a, b = c, d) assignment. + It tries to break down the unpacking if possible. In effect, it has the same + effect as passing the assigned values in SSA form to apply_fn. 
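# --- Editor's note: illustrative sketch, not part of the patch. ---
# A subclass using apply_to_single_assignments to record each elementary
# target/value pair of an assignment; the class and `record` are hypothetical.
from tensorflow.contrib.autograph.pyct import transformer

class AssignRecorder(transformer.Base):

  def visit_Assign(self, node):
    node = self.generic_visit(node)
    pairs = []

    def record(target, value):
      # Called once per single assignment, e.g. twice for `a, b = c, d`.
      pairs.append((target, value))

    self.apply_to_single_assignments(node.targets, node.value, record)
    # `pairs` now holds (target, value) AST node pairs for further processing.
    return node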
+ + Examples: + + The following will result in apply_fn(a, c), apply_fn(b, d): + + a, b = c, d + + The following will result in apply_fn(a, c[0]), apply_fn(b, c[1]): + + a, b = c + + The following will result in apply_fn(a, (b, c)): + + a = b, c + + It uses the visitor pattern to allow subclasses to process single + assignments individually. + + Args: + targets: list, tuple of or individual AST node. Should be used with the + targets field of an ast.Assign node. + values: an AST node. + apply_fn: a function of a single argument, which will be called with the + respective nodes of each single assignment. The signaure is + apply_fn(target, value), no return value. + """ + if not isinstance(targets, (list, tuple)): + targets = (targets,) + for target in targets: + if isinstance(target, (gast.Tuple, gast.List)): + for i in range(len(target.elts)): + target_el = target.elts[i] + if isinstance(values, (gast.Tuple, gast.List)): + value_el = values.elts[i] + else: + value_el = gast.Subscript(values, gast.Index(i), ctx=gast.Store()) + self.apply_to_single_assignments(target_el, value_el, apply_fn) + else: + # TODO(mdan): Look into allowing to rewrite the AST here. + apply_fn(target, values) + def visit(self, node): source_code = self.context.source_code source_file = self.context.source_file did_enter_function = False - local_scope_state_size = len(self._local_scope_state) + local_scope_size_at_entry = len(self._local_scope_state) try: if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)): - self._enclosing_entities.append(node) did_enter_function = True + if did_enter_function: + self._enclosing_entities.append(node) + if source_code and hasattr(node, 'lineno'): self._lineno = node.lineno self._col_offset = node.col_offset - if anno.hasanno(node, anno.Basic.SKIP_PROCESSING): - return node - return super(Base, self).visit(node) - except (ValueError, AttributeError, KeyError, NotImplementedError, - AssertionError) as e: + if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING): + result = super(Base, self).visit(node) + + # On exception, the local scope integrity is not guaranteed. + if did_enter_function: + self._enclosing_entities.pop() + + if local_scope_size_at_entry != len(self._local_scope_state): + raise AssertionError( + 'Inconsistent local scope stack. Before entering node %s, the' + ' stack had length %d, after exit it has length %d. This' + ' indicates enter_local_scope and exit_local_scope are not' + ' well paired.' % ( + node, + local_scope_size_at_entry, + len(self._local_scope_state) + )) + return result + + except (ValueError, AttributeError, KeyError, NotImplementedError) as e: msg = '%s: %s\nOffending source:\n%s\n\nOccurred at node:\n%s' % ( e.__class__.__name__, str(e), try_ast_to_source(node), pretty_printer.fmt(node, color=False)) @@ -130,18 +281,11 @@ def visit(self, node): line = source_code.splitlines()[self._lineno - 1] else: line = '' + # TODO(mdan): Avoid the printing of the original exception. + # In other words, we need to find how to suppress the "During handling + # of the above exception, another exception occurred" message. six.reraise(AutographParseError, AutographParseError( msg, (source_file, self._lineno, self._col_offset + 1, line)), sys.exc_info()[2]) - finally: - if did_enter_function: - self._enclosing_entities.pop() - - if local_scope_state_size != len(self._local_scope_state): - raise AssertionError( - 'Inconsistent local scope stack. Before entering node %s, the' - ' stack had length %d, after exit it has length %d. 
This' - ' indicates enter_local_scope and exit_local_scope are not' - ' well paired.') diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py index f96b0dc377521a..f110e79605945e 100644 --- a/tensorflow/contrib/autograph/pyct/transformer_test.py +++ b/tensorflow/contrib/autograph/pyct/transformer_test.py @@ -18,6 +18,8 @@ from __future__ import division from __future__ import print_function +import gast + from tensorflow.contrib.autograph.pyct import anno from tensorflow.contrib.autograph.pyct import context from tensorflow.contrib.autograph.pyct import parser @@ -27,7 +29,7 @@ class TransformerTest(test.TestCase): - def _context_for_nodetesting(self): + def _context_for_testing(self): return context.EntityContext( namer=None, source_code=None, @@ -53,7 +55,7 @@ def visit_BinOp(self, node): anno.setanno(node, 'enclosing_entities', self.enclosing_entities) return self.generic_visit(node) - tr = TestTransformer(self._context_for_nodetesting()) + tr = TestTransformer(self._context_for_testing()) def test_function(): a = 0 @@ -94,7 +96,7 @@ def inner_function(x): inner_function, lambda_node), anno.getanno(lambda_expr, 'enclosing_entities')) - def test_statement_info_stack(self): + def test_local_scope_info_stack(self): class TestTransformer(transformer.Base): @@ -116,7 +118,7 @@ def visit_While(self, node): def visit_For(self, node): return self._annotate_result(node) - tr = TestTransformer(self._context_for_nodetesting()) + tr = TestTransformer(self._context_for_testing()) def test_function(a): """Docstring.""" @@ -142,7 +144,7 @@ def test_function(a): self.assertFalse(anno.hasanno(while_node, 'string')) self.assertEqual('1', anno.getanno(while_node, 'test')) - def test_statement_info_stack_checks_integrity(self): + def test_local_scope_info_stack_checks_integrity(self): class TestTransformer(transformer.Base): @@ -155,7 +157,7 @@ def visit_For(self, node): self.exit_local_scope() return node - tr = TestTransformer(self._context_for_nodetesting()) + tr = TestTransformer(self._context_for_testing()) def no_exit(a): if a > 0: @@ -174,6 +176,38 @@ def no_entry(a): with self.assertRaises(AssertionError): tr.visit(node) + def test_visit_block_postprocessing(self): + + class TestTransformer(transformer.Base): + + def _process_body_item(self, node): + if isinstance(node, gast.Assign) and (node.value.id == 'y'): + if_node = gast.If(gast.Name('x', gast.Load(), None), [node], []) + return if_node, if_node.body + return node, None + + def visit_FunctionDef(self, node): + node.body = self.visit_block( + node.body, after_visit=self._process_body_item) + return node + + def test_function(x, y): + z = x + z = y + return z + + tr = TestTransformer(self._context_for_testing()) + + node, _ = parser.parse_entity(test_function) + node = tr.visit(node) + node = node.body[0] + + self.assertEqual(len(node.body), 2) + self.assertTrue(isinstance(node.body[0], gast.Assign)) + self.assertTrue(isinstance(node.body[1], gast.If)) + self.assertTrue(isinstance(node.body[1].body[0], gast.Assign)) + self.assertTrue(isinstance(node.body[1].body[1], gast.Return)) + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/autograph/utils/BUILD b/tensorflow/contrib/autograph/utils/BUILD index d3a1b946889253..d82c17bf2afd01 100644 --- a/tensorflow/contrib/autograph/utils/BUILD +++ b/tensorflow/contrib/autograph/utils/BUILD @@ -33,6 +33,8 @@ py_library( srcs_version = "PY2AND3", visibility = ["//tensorflow:__subpackages__"], deps = [ + 
"//tensorflow/contrib/autograph/pyct", + "//tensorflow/python:dtypes", "//tensorflow/python:list_ops", "//tensorflow/python:script_ops", "//tensorflow/python/data/ops:dataset_ops", diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py index 211e8eaee9082d..998087e056c2cd 100644 --- a/tensorflow/contrib/autograph/utils/builtins.py +++ b/tensorflow/contrib/autograph/utils/builtins.py @@ -24,6 +24,7 @@ from tensorflow.contrib.autograph.utils import py_func from tensorflow.contrib.autograph.utils import type_check +from tensorflow.python.framework import dtypes from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import logging_ops @@ -38,7 +39,13 @@ def dynamic_builtin(f, *args, **kwargs): return dynamic_range(*args, **kwargs) if f is range: return dynamic_range(*args, **kwargs) - raise ValueError('%s is not supported' % f) + if f is int: + return dynamic_int(*args, **kwargs) + if f is float: + return dynamic_float(*args, **kwargs) + + raise NotImplementedError( + 'The "%s" builtin is not yet supported.' % f.__name__) def dynamic_len(list_or_tensor): @@ -52,6 +59,20 @@ def dynamic_len(list_or_tensor): return len(list_or_tensor) +def dynamic_int(num_or_tensor, **kwargs): + """Implementation of int() using dynamic dispatch.""" + if tensor_util.is_tensor(num_or_tensor): + return math_ops.cast(num_or_tensor, dtype=dtypes.int32, **kwargs) + return int(num_or_tensor) + + +def dynamic_float(num_or_tensor, **kwargs): + """Implementation of float() using dynamic dispatch.""" + if tensor_util.is_tensor(num_or_tensor): + return math_ops.cast(num_or_tensor, dtype=dtypes.float32, **kwargs) + return float(num_or_tensor) + + def dynamic_range(start_or_stop, stop=None, step=None): """Implementation of range using dynamic dispatch.""" if type_check.is_tensor(start_or_stop, stop, step): diff --git a/tensorflow/contrib/autograph/utils/builtins_test.py b/tensorflow/contrib/autograph/utils/builtins_test.py index 163e6984079fea..0c2312178a9210 100644 --- a/tensorflow/contrib/autograph/utils/builtins_test.py +++ b/tensorflow/contrib/autograph/utils/builtins_test.py @@ -24,6 +24,7 @@ from tensorflow.contrib.autograph.utils import builtins from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.platform import test @@ -77,7 +78,7 @@ def range(x): # pylint:disable=redefined-builtin return x # Functions that just have the names of builtins are rejected. 
- with self.assertRaises(ValueError): + with self.assertRaises(NotImplementedError): self.assertEqual(builtins.dynamic_builtin(range, 1), 1) if six.PY2: self.assertListEqual( @@ -87,6 +88,20 @@ def range(x): # pylint:disable=redefined-builtin self.assertListEqual( list(builtins.dynamic_builtin(six.moves.xrange, 3)), [0, 1, 2]) + def test_casts(self): + i = constant_op.constant(2, dtype=dtypes.int32) + f = constant_op.constant(1.0, dtype=dtypes.float32) + + self.assertEqual(builtins.dynamic_builtin(int, i).dtype, dtypes.int32) + self.assertEqual(builtins.dynamic_builtin(int, f).dtype, dtypes.int32) + self.assertEqual(builtins.dynamic_builtin(float, i).dtype, dtypes.float32) + self.assertEqual(builtins.dynamic_builtin(float, f).dtype, dtypes.float32) + + self.assertEqual(builtins.dynamic_builtin(int, True), 1) + self.assertEqual(builtins.dynamic_builtin(int, False), 0) + self.assertEqual(builtins.dynamic_builtin(float, True), 1.0) + self.assertEqual(builtins.dynamic_builtin(float, False), 0.0) + def test_dynamic_print_tf(self): try: out_capturer = six.StringIO() diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD index d65c990c87cbc3..b27a19b16c08cb 100644 --- a/tensorflow/contrib/batching/BUILD +++ b/tensorflow/contrib/batching/BUILD @@ -49,6 +49,14 @@ cc_library( ], ) +cc_library( + name = "serial_device_batch_scheduler", + hdrs = ["serial_device_batch_scheduler.h"], + deps = [ + "//tensorflow/core/kernels/batching_util:serial_device_batch_scheduler", + ], +) + cc_library( name = "basic_batch_scheduler", hdrs = ["basic_batch_scheduler.h"], @@ -96,6 +104,7 @@ py_test( name = "batch_ops_test", size = "small", srcs = ["python/ops/batch_ops_test.py"], + shard_count = 5, srcs_version = "PY2AND3", tags = [ "manual", diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py index fac7aff29f79fa..ea8339334f9b5e 100644 --- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py +++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py @@ -23,7 +23,10 @@ from tensorflow.contrib.batching.python.ops import batch_ops from tensorflow.python.framework import dtypes +from tensorflow.python.framework import function +from tensorflow.python.framework.errors import InvalidArgumentError from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_batch_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import script_ops from tensorflow.python.platform import test @@ -205,6 +208,114 @@ def worker(): self.assertEqual(thread_results[0], [2]) self.assertEqual(main_results[0], [3]) + def testBatchFunctionOp(self): + """Tests that the batch_function op works.""" + with self.test_session() as sess: + + @function.Defun(dtypes.int32) + def computation(in_t): + return in_t + 1 + + inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1]) + result = gen_batch_ops.batch_function( + [inp], + num_batch_threads=1, + max_batch_size=10, + batch_timeout_micros=100000, + Tout=[dtypes.int32], + f=computation, + captured_tensors=computation.captured_inputs) + thread_results = [] + + def worker(): + thread_results.extend(sess.run([result], feed_dict={inp: [1]})) + + worker_thread = threading.Thread(target=worker) + worker_thread.start() + main_results = sess.run([result], feed_dict={inp: [2]}) + worker_thread.join() + self.assertEqual(thread_results[0], [2]) + self.assertEqual(main_results[0], [3]) + + def testBatchFunctionOpWithCapturedInput(self): + """Tests that 
batch_function op works with captured input.""" + with self.test_session() as sess: + captured_inp0 = array_ops.placeholder_with_default(2, shape=[]) + captured_inp1 = array_ops.placeholder_with_default(1, shape=[]) + inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1]) + + @function.Defun(dtypes.int32) + def computation(inp): + return inp + captured_inp0 - captured_inp1 + + result = gen_batch_ops.batch_function( + num_batch_threads=1, + max_batch_size=10, + batch_timeout_micros=100000, # 100ms + allowed_batch_sizes=[3, 10], + batching_queue="", + f=computation, + in_tensors=[inp], + captured_tensors=computation.captured_inputs, + Tout=[o.type for o in computation.definition.signature.output_arg]) + + thread_results = [] + + def worker(): + thread_results.extend(sess.run([result], feed_dict={inp: [1]})) + + worker_thread = threading.Thread(target=worker) + worker_thread.start() + main_results = sess.run([result], feed_dict={inp: [2]}) + worker_thread.join() + self.assertEqual(thread_results[0], [2]) + self.assertEqual(main_results[0], [3]) + + def testBatchFunctionOpWithInputError(self): + """Tests that batch_function op works with error in the inputs.""" + with self.test_session() as sess: + inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1]) + + @function.Defun(dtypes.int32, dtypes.int32) + def computation(in0, in1): + return in0 + in1 + + result = gen_batch_ops.batch_function( + [inp], # computation actually expects 2 inputs. + num_batch_threads=1, + max_batch_size=10, + batch_timeout_micros=100000, # 100ms + batching_queue="", + f=computation, + captured_tensors=computation.captured_inputs, + Tout=[o.type for o in computation.definition.signature.output_arg]) + + with self.assertRaisesRegexp(InvalidArgumentError, + ".*2 arguments.*but 1.*"): + sess.run([result], feed_dict={inp: [2]}) + + def testBasicUnbatchDecoratedWithReshape(self): + """Tests that the batch_function decorator works.""" + with self.test_session() as sess: + + @batch_ops.batch_function(1, 10, 100000) + def computation(in_t): + return array_ops.reshape(in_t, [-1]) + 1 + + inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1, 1]) + result = computation(inp) + thread_results = [] + + def worker(): + thread_results.extend(sess.run([result], feed_dict={inp: [[1]]})) + + worker_thread = threading.Thread(target=worker) + worker_thread.start() + main_results = sess.run([result], feed_dict={inp: [[2]]}) + worker_thread.join() + self.assertEqual(thread_results[0], [2]) + self.assertEqual(main_results[0], [3]) + def testUnbatchTimeout(self): """Tests that the unbatch timeout works.""" with self.test_session() as sess: @@ -250,7 +361,7 @@ def worker(): def testUnbatchGrad(self): """Tests that batch and unbatch are differentiable.""" with self.test_session() as sess: - inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1]) + inp = array_ops.placeholder(dtype=dtypes.float32, shape=[1]) batched, index, id_t = batch_ops.batch( [inp], num_batch_threads=1, max_batch_size=2, batch_timeout_micros=36000000, grad_timeout_micros=1000000, diff --git a/tensorflow/contrib/batching/serial_device_batch_scheduler.h b/tensorflow/contrib/batching/serial_device_batch_scheduler.h new file mode 100644 index 00000000000000..bf6b7083612018 --- /dev/null +++ b/tensorflow/contrib/batching/serial_device_batch_scheduler.h @@ -0,0 +1,21 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_ +#define TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_ + +#include "tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h" + +#endif // TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_ diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py index 9994c84ebdb930..758754feac31f1 100644 --- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py +++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py @@ -45,6 +45,7 @@ _DNN_LEARNING_RATE = 0.001 + def _get_optimizer(optimizer): if callable(optimizer): return optimizer() @@ -73,6 +74,7 @@ def _dnn_tree_combined_model_fn(features, dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True, dnn_steps_to_train=10000, + predict_with_tree_only=False, tree_feature_columns=None, tree_center_bias=False, use_core_versions=False): @@ -108,6 +110,8 @@ def _dnn_tree_combined_model_fn(features, as a feature to the tree. dnn_steps_to_train: Number of steps to train dnn for before switching to gbdt. + predict_with_tree_only: Whether to use only the tree model output as the + final prediction. tree_feature_columns: An iterable containing all the feature columns used by the model's boosted trees. If dnn_input_layer_to_tree is set to True, these features are in addition to dnn_feature_columns. @@ -132,8 +136,7 @@ def _dnn_tree_combined_model_fn(features, dnn_parent_scope = "dnn" dnn_partitioner = dnn_input_layer_partitioner or ( partitioned_variables.min_max_variable_partitioner( - max_partitions=config.num_ps_replicas, - min_slice_size=64 << 20)) + max_partitions=config.num_ps_replicas, min_slice_size=64 << 20)) with variable_scope.variable_scope( dnn_parent_scope, @@ -171,8 +174,7 @@ def _dnn_tree_combined_model_fn(features, _add_hidden_layer_summary(net, hidden_layer_scope.name) previous_layer = net with variable_scope.variable_scope( - "logits", - values=(previous_layer,)) as logits_scope: + "logits", values=(previous_layer,)) as logits_scope: dnn_logits = layers.fully_connected( previous_layer, head.logits_dimension, @@ -190,8 +192,7 @@ def _dnn_train_op_fn(loss): optimizer=_get_optimizer(dnn_optimizer), name=dnn_parent_scope, variables=ops.get_collection( - ops.GraphKeys.TRAINABLE_VARIABLES, - scope=dnn_parent_scope), + ops.GraphKeys.TRAINABLE_VARIABLES, scope=dnn_parent_scope), # Empty summaries to prevent optimizers from logging training_loss. 
summaries=[]) @@ -230,7 +231,10 @@ def _tree_train_op_fn(loss): update_op = state_ops.assign_add(global_step, 1).op return update_op - tree_train_logits = dnn_logits + tree_logits + if predict_with_tree_only: + tree_train_logits = tree_logits + else: + tree_train_logits = dnn_logits + tree_logits def _no_train_op_fn(loss): """Returns a no-op.""" @@ -288,10 +292,10 @@ def _no_train_op_fn(loss): finalized_trees, attempted_trees = gbdt_model.get_number_of_trees_tensor() model_fn_ops.training_hooks.extend([ - trainer_hooks.SwitchTrainOp( - dnn_train_op, dnn_steps_to_train, tree_train_op), - trainer_hooks.StopAfterNTrees( - num_trees, attempted_trees, finalized_trees)]) + trainer_hooks.SwitchTrainOp(dnn_train_op, dnn_steps_to_train, + tree_train_op), + trainer_hooks.StopAfterNTrees(num_trees, attempted_trees, finalized_trees) + ]) return model_fn_ops @@ -318,6 +322,7 @@ def __init__(self, dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True, dnn_steps_to_train=10000, + predict_with_tree_only=False, tree_feature_columns=None, tree_center_bias=False, use_core_versions=False): @@ -360,6 +365,8 @@ def __init__(self, as a feature to the tree. dnn_steps_to_train: Number of steps to train dnn for before switching to gbdt. + predict_with_tree_only: Whether to use only the tree model output as the + final prediction. tree_feature_columns: An iterable containing all the feature columns used by the model's boosted trees. If dnn_input_layer_to_tree is set to True, these features are in addition to dnn_feature_columns. @@ -377,16 +384,32 @@ def __init__(self, def _model_fn(features, labels, mode, config): return _dnn_tree_combined_model_fn( - features, labels, mode, head, dnn_hidden_units, dnn_feature_columns, - tree_learner_config, num_trees, tree_examples_per_layer, config, - dnn_optimizer, dnn_activation_fn, dnn_dropout, - dnn_input_layer_partitioner, dnn_input_layer_to_tree, - dnn_steps_to_train, tree_feature_columns, tree_center_bias, - use_core_versions) + features=features, + labels=labels, + mode=mode, + head=head, + dnn_hidden_units=dnn_hidden_units, + dnn_feature_columns=dnn_feature_columns, + tree_learner_config=tree_learner_config, + num_trees=num_trees, + tree_examples_per_layer=tree_examples_per_layer, + config=config, + dnn_optimizer=dnn_optimizer, + dnn_activation_fn=dnn_activation_fn, + dnn_dropout=dnn_dropout, + dnn_input_layer_partitioner=dnn_input_layer_partitioner, + dnn_input_layer_to_tree=dnn_input_layer_to_tree, + dnn_steps_to_train=dnn_steps_to_train, + predict_with_tree_only=predict_with_tree_only, + tree_feature_columns=tree_feature_columns, + tree_center_bias=tree_center_bias, + use_core_versions=use_core_versions) super(DNNBoostedTreeCombinedClassifier, self).__init__( - model_fn=_model_fn, model_dir=model_dir, - config=config, feature_engineering_fn=feature_engineering_fn) + model_fn=_model_fn, + model_dir=model_dir, + config=config, + feature_engineering_fn=feature_engineering_fn) class DNNBoostedTreeCombinedRegressor(estimator.Estimator): @@ -410,6 +433,7 @@ def __init__(self, dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True, dnn_steps_to_train=10000, + predict_with_tree_only=False, tree_feature_columns=None, tree_center_bias=False, use_core_versions=False): @@ -452,6 +476,8 @@ def __init__(self, as a feature to the tree. dnn_steps_to_train: Number of steps to train dnn for before switching to gbdt. + predict_with_tree_only: Whether to use only the tree model output as the + final prediction. 
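A minimal sketch of what this flag controls, with plain floats standing in for the logits tensors (this is only the combination rule from the model function above, not the estimator itself):

```python
def combined_logits(dnn_logits, tree_logits, predict_with_tree_only=False):
    # When the flag is set, final predictions come from the tree ensemble
    # alone; otherwise the DNN and tree logits are summed as before.
    if predict_with_tree_only:
        return tree_logits
    return dnn_logits + tree_logits

assert combined_logits(0.3, 1.2) == 1.5
assert combined_logits(0.3, 1.2, predict_with_tree_only=True) == 1.2
```

Note that the call sites into `_dnn_tree_combined_model_fn` are also switched to keyword arguments in this change, so inserting the new parameter in the middle of the signature cannot silently shift any positional arguments.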
tree_feature_columns: An iterable containing all the feature columns used by the model's boosted trees. If dnn_input_layer_to_tree is set to True, these features are in addition to dnn_feature_columns. @@ -474,16 +500,32 @@ def __init__(self, def _model_fn(features, labels, mode, config): return _dnn_tree_combined_model_fn( - features, labels, mode, head, dnn_hidden_units, dnn_feature_columns, - tree_learner_config, num_trees, tree_examples_per_layer, config, - dnn_optimizer, dnn_activation_fn, dnn_dropout, - dnn_input_layer_partitioner, dnn_input_layer_to_tree, - dnn_steps_to_train, tree_feature_columns, tree_center_bias, - use_core_versions) + features=features, + labels=labels, + mode=mode, + head=head, + dnn_hidden_units=dnn_hidden_units, + dnn_feature_columns=dnn_feature_columns, + tree_learner_config=tree_learner_config, + num_trees=num_trees, + tree_examples_per_layer=tree_examples_per_layer, + config=config, + dnn_optimizer=dnn_optimizer, + dnn_activation_fn=dnn_activation_fn, + dnn_dropout=dnn_dropout, + dnn_input_layer_partitioner=dnn_input_layer_partitioner, + dnn_input_layer_to_tree=dnn_input_layer_to_tree, + dnn_steps_to_train=dnn_steps_to_train, + predict_with_tree_only=predict_with_tree_only, + tree_feature_columns=tree_feature_columns, + tree_center_bias=tree_center_bias, + use_core_versions=use_core_versions) super(DNNBoostedTreeCombinedRegressor, self).__init__( - model_fn=_model_fn, model_dir=model_dir, - config=config, feature_engineering_fn=feature_engineering_fn) + model_fn=_model_fn, + model_dir=model_dir, + config=config, + feature_engineering_fn=feature_engineering_fn) class DNNBoostedTreeCombinedEstimator(estimator.Estimator): @@ -508,6 +550,7 @@ def __init__(self, dnn_input_layer_partitioner=None, dnn_input_layer_to_tree=True, dnn_steps_to_train=10000, + predict_with_tree_only=False, tree_feature_columns=None, tree_center_bias=False, use_core_versions=False): @@ -545,6 +588,8 @@ def __init__(self, as a feature to the tree. dnn_steps_to_train: Number of steps to train dnn for before switching to gbdt. + predict_with_tree_only: Whether to use only the tree model output as the + final prediction. tree_feature_columns: An iterable containing all the feature columns used by the model's boosted trees. If dnn_input_layer_to_tree is set to True, these features are in addition to dnn_feature_columns. @@ -553,15 +598,32 @@ def __init__(self, use_core_versions: Whether feature columns and loss are from the core (as opposed to contrib) version of tensorflow. 
""" + def _model_fn(features, labels, mode, config): return _dnn_tree_combined_model_fn( - features, labels, mode, head, dnn_hidden_units, dnn_feature_columns, - tree_learner_config, num_trees, tree_examples_per_layer, config, - dnn_optimizer, dnn_activation_fn, dnn_dropout, - dnn_input_layer_partitioner, dnn_input_layer_to_tree, - dnn_steps_to_train, tree_feature_columns, tree_center_bias, - use_core_versions) + features=features, + labels=labels, + mode=mode, + head=head, + dnn_hidden_units=dnn_hidden_units, + dnn_feature_columns=dnn_feature_columns, + tree_learner_config=tree_learner_config, + num_trees=num_trees, + tree_examples_per_layer=tree_examples_per_layer, + config=config, + dnn_optimizer=dnn_optimizer, + dnn_activation_fn=dnn_activation_fn, + dnn_dropout=dnn_dropout, + dnn_input_layer_partitioner=dnn_input_layer_partitioner, + dnn_input_layer_to_tree=dnn_input_layer_to_tree, + dnn_steps_to_train=dnn_steps_to_train, + predict_with_tree_only=predict_with_tree_only, + tree_feature_columns=tree_feature_columns, + tree_center_bias=tree_center_bias, + use_core_versions=use_core_versions) super(DNNBoostedTreeCombinedEstimator, self).__init__( - model_fn=_model_fn, model_dir=model_dir, - config=config, feature_engineering_fn=feature_engineering_fn) + model_fn=_model_fn, + model_dir=model_dir, + config=config, + feature_engineering_fn=feature_engineering_fn) diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc index 44a8ffaf4b2f5a..401bec84a20a0f 100644 --- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc @@ -43,47 +43,60 @@ namespace { const int32 DUMMY_FEATURE_DIMENSION = -1; } // namespace -class BaseBuildSplitOp : public OpKernel { +class SplitBuilderState { public: - explicit BaseBuildSplitOp(OpKernelConstruction* const context) - : OpKernel(context) { - OP_REQUIRES_OK(context, context->GetAttr("feature_column_group_id", - &feature_column_group_id_)); + explicit SplitBuilderState(OpKernelContext* const context) { + const Tensor* l1_regularization_t; OP_REQUIRES_OK(context, - context->GetAttr("l1_regularization", &l1_regularization_)); + context->input("l1_regularization", &l1_regularization_t)); + const Tensor* l2_regularization_t; OP_REQUIRES_OK(context, - context->GetAttr("l2_regularization", &l2_regularization_)); - OP_REQUIRES_OK(context, context->GetAttr("tree_complexity_regularization", - &tree_complexity_regularization_)); + context->input("l2_regularization", &l2_regularization_t)); + const Tensor* tree_complexity_regularization_t; + OP_REQUIRES_OK(context, context->input("tree_complexity_regularization", + &tree_complexity_regularization_t)); + const Tensor* min_node_weight_t; OP_REQUIRES_OK(context, - context->GetAttr("min_node_weight", &min_node_weight_)); + context->input("min_node_weight", &min_node_weight_t)); - int strategy; - OP_REQUIRES_OK(context, context->GetAttr("multiclass_strategy", &strategy)); + const Tensor* feature_column_group_id_t; + OP_REQUIRES_OK(context, context->input("feature_column_group_id", + &feature_column_group_id_t)); + + const Tensor* multiclass_strategy_t; + OP_REQUIRES_OK( + context, context->input("multiclass_strategy", &multiclass_strategy_t)); + int strategy = multiclass_strategy_t->scalar()(); OP_REQUIRES( context, boosted_trees::learner::LearnerConfig_MultiClassStrategy_IsValid( strategy), errors::InvalidArgument("Wrong multiclass strategy passed.")); - 
multiclass_strategy_ = LearnerConfig_MultiClassStrategy(strategy); - } - NodeStats ComputeNodeStats(const GradientStats& grad_stats) { - return NodeStats(l1_regularization_, l2_regularization_, min_node_weight_, - multiclass_strategy_, grad_stats); - } + multiclass_strategy_ = LearnerConfig_MultiClassStrategy(strategy); - void ReadClassId(OpKernelContext* const context, int32* class_id) { const Tensor* class_id_t; OP_REQUIRES_OK(context, context->input("class_id", &class_id_t)); OP_REQUIRES(context, TensorShapeUtils::IsScalar(class_id_t->shape()), errors::InvalidArgument("class_id must be a scalar.")); - *class_id = class_id_t->scalar()(); + class_id_ = class_id_t->scalar()(); + + l1_regularization_ = l1_regularization_t->scalar()(); + l2_regularization_ = l2_regularization_t->scalar()(); + tree_complexity_regularization_ = + tree_complexity_regularization_t->scalar()(); + min_node_weight_ = min_node_weight_t->scalar()(); + feature_column_group_id_ = feature_column_group_id_t->scalar()(); } - void FillLeaf(const int class_id, const NodeStats& best_node_stats, + NodeStats ComputeNodeStats(const GradientStats& grad_stats) { + return NodeStats(l1_regularization_, l2_regularization_, min_node_weight_, + multiclass_strategy_, grad_stats); + } + + void FillLeaf(const NodeStats& best_node_stats, boosted_trees::trees::Leaf* leaf) const { - if (class_id == -1) { + if (class_id_ == -1) { // This would be the case either for TREE_PER_CLASS with only 2 classes, // or for other multiclass strategies. for (float f : best_node_stats.weight_contribution) { @@ -93,25 +106,31 @@ class BaseBuildSplitOp : public OpKernel { CHECK(best_node_stats.weight_contribution.size() == 1) << "Weight contribution size = " << best_node_stats.weight_contribution.size(); - leaf->mutable_sparse_vector()->add_index(class_id); + leaf->mutable_sparse_vector()->add_index(class_id_); leaf->mutable_sparse_vector()->add_value( best_node_stats.weight_contribution[0]); } } - protected: + int32 feature_column_group_id() { return feature_column_group_id_; } + float tree_complexity_regularization() { + return tree_complexity_regularization_; + } + + private: LearnerConfig_MultiClassStrategy multiclass_strategy_; - int32 feature_column_group_id_; float l1_regularization_; float l2_regularization_; - float min_node_weight_; float tree_complexity_regularization_; + float min_node_weight_; + int32 class_id_; + int32 feature_column_group_id_; }; -class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp { +class BuildDenseInequalitySplitsOp : public OpKernel { public: explicit BuildDenseInequalitySplitsOp(OpKernelConstruction* const context) - : BaseBuildSplitOp(context) {} + : OpKernel(context) {} void Compute(OpKernelContext* const context) override { const Tensor* num_minibatches_t; @@ -139,9 +158,6 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp { const Tensor* hessians_t; OP_REQUIRES_OK(context, context->input("hessians", &hessians_t)); - int class_id; - ReadClassId(context, &class_id); - // Find the number of unique partitions before we allocate the output. 
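One practical effect of moving these hyperparameters from op attributes into op inputs is that they are now ordinary tensors, so they no longer have to be frozen into the graph when the op is constructed. A generic TF 1.x illustration of that difference (this is not the split-handler op itself; the placeholder name is made up):

```python
import tensorflow as tf

# As an op *input*, a value such as the L2 penalty can be fed per run.
l2 = tf.placeholder(tf.float32, shape=[], name="l2_regularization")
penalty = l2 * tf.constant(4.0)

with tf.Session() as sess:
    print(sess.run(penalty, feed_dict={l2: 0.1}))  # ~0.4
    print(sess.run(penalty, feed_dict={l2: 1.0}))  # 4.0
```

An attribute, by contrast, is baked into the `NodeDef` at graph-construction time and cannot change between `session.run` calls.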
std::vector partition_boundaries; partition_boundaries.push_back(0); @@ -185,6 +201,7 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp { &output_splits_t)); tensorflow::TTypes::Vec output_splits = output_splits_t->vec(); + SplitBuilderState state(context); for (int root_idx = 0; root_idx < num_elements; ++root_idx) { float best_gain = std::numeric_limits::lowest(); int start_index = partition_boundaries[root_idx]; @@ -196,7 +213,7 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp { GradientStats(*gradients_t, *hessians_t, bucket_idx); } root_gradient_stats *= normalizer_ratio; - NodeStats root_stats = ComputeNodeStats(root_gradient_stats); + NodeStats root_stats = state.ComputeNodeStats(root_gradient_stats); int32 best_bucket_idx = 0; NodeStats best_right_node_stats(0); NodeStats best_left_node_stats(0); @@ -206,10 +223,10 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp { GradientStats g(*gradients_t, *hessians_t, bucket_idx); g *= normalizer_ratio; left_gradient_stats += g; - NodeStats left_stats = ComputeNodeStats(left_gradient_stats); + NodeStats left_stats = state.ComputeNodeStats(left_gradient_stats); GradientStats right_gradient_stats = root_gradient_stats - left_gradient_stats; - NodeStats right_stats = ComputeNodeStats(right_gradient_stats); + NodeStats right_stats = state.ComputeNodeStats(right_gradient_stats); if (left_stats.gain + right_stats.gain > best_gain) { best_gain = left_stats.gain + right_stats.gain; best_left_node_stats = left_stats; @@ -220,18 +237,18 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp { SplitInfo split_info; auto* dense_split = split_info.mutable_split_node()->mutable_dense_float_binary_split(); - dense_split->set_feature_column(feature_column_group_id_); + dense_split->set_feature_column(state.feature_column_group_id()); dense_split->set_threshold( bucket_boundaries(bucket_ids(best_bucket_idx, 0))); auto* left_child = split_info.mutable_left_child(); auto* right_child = split_info.mutable_right_child(); - FillLeaf(class_id, best_left_node_stats, left_child); - FillLeaf(class_id, best_right_node_stats, right_child); + state.FillLeaf(best_left_node_stats, left_child); + state.FillLeaf(best_right_node_stats, right_child); split_info.SerializeToString(&output_splits(root_idx)); gains(root_idx) = - best_gain - root_stats.gain - tree_complexity_regularization_; + best_gain - root_stats.gain - state.tree_complexity_regularization(); output_partition_ids(root_idx) = partition_ids(start_index); } } @@ -239,13 +256,10 @@ class BuildDenseInequalitySplitsOp : public BaseBuildSplitOp { REGISTER_KERNEL_BUILDER(Name("BuildDenseInequalitySplits").Device(DEVICE_CPU), BuildDenseInequalitySplitsOp); -class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { +class BuildSparseInequalitySplitsOp : public OpKernel { public: explicit BuildSparseInequalitySplitsOp(OpKernelConstruction* const context) - : BaseBuildSplitOp(context) { - OP_REQUIRES_OK(context, - context->GetAttr("bias_feature_id", &bias_feature_id_)); - } + : OpKernel(context) {} void Compute(OpKernelContext* const context) override { const Tensor* num_minibatches_t; @@ -275,8 +289,10 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { const Tensor* hessians_t; OP_REQUIRES_OK(context, context->input("hessians", &hessians_t)); - int class_id; - ReadClassId(context, &class_id); + const Tensor* bias_feature_id_t; + OP_REQUIRES_OK(context, + context->input("bias_feature_id", &bias_feature_id_t)); + int64 bias_feature_id = 
bias_feature_id_t->scalar()(); // For each partition (tree node), store starting index for each dimension. PartitionAndDimensionBoundaries partition_boundaries; @@ -354,6 +370,7 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { &output_splits_t)); tensorflow::TTypes::Vec output_splits = output_splits_t->vec(); + SplitBuilderState state(context); // For each tree node that needs to be split. for (int root_idx = 0; root_idx < num_elements; ++root_idx) { const auto& dimension_boundaries = @@ -372,7 +389,7 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { OP_REQUIRES( context, - bucket_ids_and_dimensions(bias_start_index, 0) == bias_feature_id_, + bucket_ids_and_dimensions(bias_start_index, 0) == bias_feature_id, errors::InvalidArgument("Bias feature ID missing.")); // Dimension for bias feature is always 0 @@ -388,7 +405,7 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { GradientStats root_gradient_stats(*gradients_t, *hessians_t, bias_start_index); root_gradient_stats *= normalizer_ratio; - NodeStats root_stats = ComputeNodeStats(root_gradient_stats); + NodeStats root_stats = state.ComputeNodeStats(root_gradient_stats); // Iterate through dimensions. for (int j = 0; j < dimension_boundaries.size() - 1; ++j) { @@ -408,7 +425,7 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { << bucket_ids_and_dimensions(start_index, 1) << " and for " << bucket_ids_and_dimensions(end_index - 1, 0) << " " << bucket_ids_and_dimensions(end_index - 1, 1); - if (bucket_ids_and_dimensions(start_index, 0) == bias_feature_id_) { + if (bucket_ids_and_dimensions(start_index, 0) == bias_feature_id) { // 0-dimension case which has a first bucket for catch all feature. CHECK(bucket_ids_and_dimensions(start_index, 1) == 0) << "Dimension of bias feature should be 0"; @@ -422,6 +439,10 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { GradientStats(*gradients_t, *hessians_t, bucket_idx); } present_gradient_stats *= normalizer_ratio; + GradientStats not_present = + root_gradient_stats - present_gradient_stats; + // If there was (almost) no sparsity, fix the default direction to LEFT. + bool fixed_default_direction = not_present.IsAlmostZero(); GradientStats left_gradient_stats; for (int64 element_idx = start_index; element_idx < end_index; @@ -441,11 +462,12 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { // backward pass gradients. GradientStats right_gradient_stats = present_gradient_stats - left_gradient_stats; + { - NodeStats left_stats_default_left = - ComputeNodeStats(root_gradient_stats - right_gradient_stats); + NodeStats left_stats_default_left = state.ComputeNodeStats( + root_gradient_stats - right_gradient_stats); NodeStats right_stats_default_left = - ComputeNodeStats(right_gradient_stats); + state.ComputeNodeStats(right_gradient_stats); if (left_stats_default_left.gain + right_stats_default_left.gain > best_gain) { best_gain = @@ -457,11 +479,13 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { best_dimension_idx = dimension_id; } } - { + // Consider calculating the default direction only when there were + // enough missing examples. 
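A small numeric sketch of why this shortcut is safe (toy Python, not the kernel code): when the statistics of the missing examples are almost zero, the default-left and default-right split candidates score identically, so only one of them needs to be evaluated. The quadratic `gain` below is a hypothetical stand-in for the `NodeStats` gain computation.

```python
def split_gains(left, right, not_present, gain):
    # default-left: missing examples join the left child
    default_left = gain(left + not_present) + gain(right)
    # default-right: missing examples join the right child
    default_right = gain(left) + gain(right + not_present)
    return default_left, default_right

gain = lambda g: g * g  # toy stand-in for the node gain

print(split_gains(2.0, 1.0, 0.0, gain))  # (5.0, 5.0)   identical, skip one
print(split_gains(2.0, 1.0, 0.5, gain))  # (7.25, 6.25) direction matters
```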
+ if (!fixed_default_direction) { NodeStats left_stats_default_right = - ComputeNodeStats(left_gradient_stats); - NodeStats right_stats_default_right = - ComputeNodeStats(root_gradient_stats - left_gradient_stats); + state.ComputeNodeStats(left_gradient_stats); + NodeStats right_stats_default_right = state.ComputeNodeStats( + root_gradient_stats - left_gradient_stats); if (left_stats_default_right.gain + right_stats_default_right.gain > best_gain) { best_gain = left_stats_default_right.gain + @@ -487,7 +511,7 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { ->mutable_sparse_float_binary_split_default_left() ->mutable_split(); } - dense_split->set_feature_column(feature_column_group_id_); + dense_split->set_feature_column(state.feature_column_group_id()); // Set the feature index for the best feature column. const int64 best_dimension_id = bucket_ids_and_dimensions(best_element_idx, 1); @@ -498,11 +522,11 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { auto* left_child = split_info.mutable_left_child(); auto* right_child = split_info.mutable_right_child(); - FillLeaf(class_id, best_left_node_stats, left_child); - FillLeaf(class_id, best_right_node_stats, right_child); + state.FillLeaf(best_left_node_stats, left_child); + state.FillLeaf(best_right_node_stats, right_child); split_info.SerializeToString(&output_splits(root_idx)); gains(root_idx) = - best_gain - root_stats.gain - tree_complexity_regularization_; + best_gain - root_stats.gain - state.tree_complexity_regularization(); output_partition_ids(root_idx) = partition_ids(bias_start_index); } } @@ -519,19 +543,14 @@ class BuildSparseInequalitySplitsOp : public BaseBuildSplitOp { // For each partition, store start indices of feature column dimensions. typedef std::vector> PartitionAndDimensionBoundaries; - - int64 bias_feature_id_; }; REGISTER_KERNEL_BUILDER(Name("BuildSparseInequalitySplits").Device(DEVICE_CPU), BuildSparseInequalitySplitsOp); -class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp { +class BuildCategoricalEqualitySplitsOp : public OpKernel { public: explicit BuildCategoricalEqualitySplitsOp(OpKernelConstruction* const context) - : BaseBuildSplitOp(context) { - OP_REQUIRES_OK(context, - context->GetAttr("bias_feature_id", &bias_feature_id_)); - } + : OpKernel(context) {} void Compute(OpKernelContext* const context) override { const Tensor* num_minibatches_t; @@ -554,8 +573,10 @@ class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp { const Tensor* hessians_t; OP_REQUIRES_OK(context, context->input("hessians", &hessians_t)); - int class_id; - ReadClassId(context, &class_id); + const Tensor* bias_feature_id_t; + OP_REQUIRES_OK(context, + context->input("bias_feature_id", &bias_feature_id_t)); + int64 bias_feature_id = bias_feature_id_t->scalar()(); // Find the number of unique partitions before we allocate the output. std::vector partition_boundaries; @@ -598,16 +619,17 @@ class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp { &output_splits_t)); tensorflow::TTypes::Vec output_splits = output_splits_t->vec(); + SplitBuilderState state(context); for (int root_idx = 0; root_idx < num_elements; ++root_idx) { float best_gain = std::numeric_limits::lowest(); int start_index = partition_boundaries[non_empty_partitions[root_idx]]; int end_index = partition_boundaries[non_empty_partitions[root_idx] + 1]; // First feature ID in each partition should be the bias feature. 
- OP_REQUIRES(context, feature_ids(start_index, 0) == bias_feature_id_, + OP_REQUIRES(context, feature_ids(start_index, 0) == bias_feature_id, errors::InvalidArgument("Bias feature ID missing.")); GradientStats root_gradient_stats(*gradients_t, *hessians_t, start_index); root_gradient_stats *= normalizer_ratio; - NodeStats root_stats = ComputeNodeStats(root_gradient_stats); + NodeStats root_stats = state.ComputeNodeStats(root_gradient_stats); int32 best_feature_idx = 0; NodeStats best_right_node_stats(0); NodeStats best_left_node_stats(0); @@ -618,8 +640,8 @@ class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp { left_gradient_stats *= normalizer_ratio; GradientStats right_gradient_stats = root_gradient_stats - left_gradient_stats; - NodeStats left_stats = ComputeNodeStats(left_gradient_stats); - NodeStats right_stats = ComputeNodeStats(right_gradient_stats); + NodeStats left_stats = state.ComputeNodeStats(left_gradient_stats); + NodeStats right_stats = state.ComputeNodeStats(right_gradient_stats); if (left_stats.gain + right_stats.gain > best_gain) { best_gain = left_stats.gain + right_stats.gain; best_left_node_stats = left_stats; @@ -630,21 +652,18 @@ class BuildCategoricalEqualitySplitsOp : public BaseBuildSplitOp { SplitInfo split_info; auto* equality_split = split_info.mutable_split_node() ->mutable_categorical_id_binary_split(); - equality_split->set_feature_column(feature_column_group_id_); + equality_split->set_feature_column(state.feature_column_group_id()); equality_split->set_feature_id(feature_ids(best_feature_idx, 0)); auto* left_child = split_info.mutable_left_child(); auto* right_child = split_info.mutable_right_child(); - FillLeaf(class_id, best_left_node_stats, left_child); - FillLeaf(class_id, best_right_node_stats, right_child); + state.FillLeaf(best_left_node_stats, left_child); + state.FillLeaf(best_right_node_stats, right_child); split_info.SerializeToString(&output_splits(root_idx)); gains(root_idx) = - best_gain - root_stats.gain - tree_complexity_regularization_; + best_gain - root_stats.gain - state.tree_complexity_regularization(); output_partition_ids(root_idx) = partition_ids(start_index); } } - - private: - int64 bias_feature_id_; }; REGISTER_KERNEL_BUILDER( diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py index 9d6cc9245aa463..409a2d8f46c331 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py @@ -64,6 +64,8 @@ import re from tensorflow.contrib.boosted_trees.lib.learner.batch import base_split_handler +from tensorflow.contrib.boosted_trees.python.ops import gen_quantile_ops +from tensorflow.contrib.boosted_trees.python.ops import gen_stats_accumulator_ops from tensorflow.contrib.boosted_trees.python.ops import quantile_ops from tensorflow.contrib.boosted_trees.python.ops import split_handler_ops from tensorflow.contrib.boosted_trees.python.ops import stats_accumulator_ops @@ -72,9 +74,11 @@ from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops + _BIAS_FEATURE_ID = -1 # Pattern to remove all non alpha numeric from a string. 
_PATTERN = re.compile(r"[\W_]+") @@ -130,11 +134,14 @@ def __init__(self, gradient_shape, hessian_shape, name="StatsAccumulator/{}".format(self._name)) - self._quantile_accumulator = quantile_ops.QuantileAccumulator( - init_stamp_token, - epsilon=epsilon, - num_quantiles=num_quantiles, - name="QuantileAccumulator/{}".format(self._name)) + # Allocate both stats accumulator and quantile accumulator on the same + # device so that we can build splits with fewer RPCs. + with ops.colocate_with(self._stats_accumulator.resource()): + self._quantile_accumulator = quantile_ops.QuantileAccumulator( + init_stamp_token, + epsilon=epsilon, + num_quantiles=num_quantiles, + name="QuantileAccumulator/{}".format(self._name)) class DenseSplitHandler(InequalitySplitHandler): @@ -236,45 +243,74 @@ def update_stats(self, stamp_token, example_partition_ids, gradients, def make_splits(self, stamp_token, next_stamp_token, class_id): """Create the best split using the accumulated stats and flush the state.""" - # Get the bucket boundaries - are_splits_ready, buckets = ( - self._quantile_accumulator.get_buckets(stamp_token)) - # After we receive the boundaries from previous iteration we can flush - # the quantile accumulator. - with ops.control_dependencies([buckets]): - flush_quantiles = self._quantile_accumulator.flush( - stamp_token=stamp_token, next_stamp_token=next_stamp_token) - - # Get the aggregated gradients and hessians per - # pair. - # In order to distribute the computation on all the PSs we use the PS that - # had the stats accumulator on. - with ops.device(None): - with ops.device(self._stats_accumulator.resource().device): - num_minibatches, partition_ids, bucket_ids, gradients, hessians = ( - self._stats_accumulator.flush(stamp_token, next_stamp_token)) - - # Put quantile and stats accumulator flushing in the dependency path. - are_splits_ready = control_flow_ops.with_dependencies( - [flush_quantiles, partition_ids], are_splits_ready) - - partition_ids, gains, split_infos = ( - split_handler_ops.build_dense_inequality_splits( - num_minibatches=num_minibatches, - bucket_boundaries=buckets, - partition_ids=partition_ids, - bucket_ids=bucket_ids, - gradients=gradients, - hessians=hessians, - class_id=class_id, - feature_column_group_id=self._feature_column_group_id, - l1_regularization=self._l1_regularization, - l2_regularization=self._l2_regularization, - tree_complexity_regularization=self. 
- _tree_complexity_regularization, - min_node_weight=self._min_node_weight, - multiclass_strategy=self._multiclass_strategy)) - return (are_splits_ready, partition_ids, gains, split_infos) + if (self._gradient_shape == tensor_shape.scalar() and + self._hessian_shape == tensor_shape.scalar()): + handler = make_dense_split_scalar + else: + handler = make_dense_split_tensor + + are_splits_ready, partition_ids, gains, split_infos = ( + handler(self._quantile_accumulator.resource(), + self._stats_accumulator.resource(), stamp_token, + next_stamp_token, self._multiclass_strategy, class_id, + self._feature_column_group_id, self._l1_regularization, + self._l2_regularization, self._tree_complexity_regularization, + self._min_node_weight)) + return are_splits_ready, partition_ids, gains, split_infos + + +def _make_dense_split(quantile_accumulator_handle, stats_accumulator_handle, + stamp_token, next_stamp_token, multiclass_strategy, + class_id, feature_column_id, l1_regularization, + l2_regularization, tree_complexity_regularization, + min_node_weight, is_multi_dimentional): + """Function that builds splits for a dense feature column.""" + # Get the bucket boundaries + are_splits_ready, buckets = ( + gen_quantile_ops.quantile_accumulator_get_buckets( + quantile_accumulator_handles=[quantile_accumulator_handle], + stamp_token=stamp_token)) + # quantile_accumulator_get_buckets returns a list of results per handle that + # we pass to it. In this case we're getting results just for one resource. + are_splits_ready = are_splits_ready[0] + buckets = buckets[0] + + # After we receive the boundaries from previous iteration we can flush + # the quantile accumulator. + with ops.control_dependencies([buckets]): + flush_quantiles = gen_quantile_ops.quantile_accumulator_flush( + quantile_accumulator_handle=quantile_accumulator_handle, + stamp_token=stamp_token, + next_stamp_token=next_stamp_token) + + if is_multi_dimentional: + num_minibatches, partition_ids, bucket_ids, gradients, hessians = ( + gen_stats_accumulator_ops.stats_accumulator_tensor_flush( + stats_accumulator_handle, stamp_token, next_stamp_token)) + else: + num_minibatches, partition_ids, bucket_ids, gradients, hessians = ( + gen_stats_accumulator_ops.stats_accumulator_scalar_flush( + stats_accumulator_handle, stamp_token, next_stamp_token)) + + # Put quantile and stats accumulator flushing in the dependency path. + with ops.control_dependencies([flush_quantiles, partition_ids]): + are_splits_ready = array_ops.identity(are_splits_ready) + partition_ids, gains, split_infos = ( + split_handler_ops.build_dense_inequality_splits( + num_minibatches=num_minibatches, + bucket_boundaries=buckets, + partition_ids=partition_ids, + bucket_ids=bucket_ids, + gradients=gradients, + hessians=hessians, + class_id=class_id, + feature_column_group_id=feature_column_id, + l1_regularization=l1_regularization, + l2_regularization=l2_regularization, + tree_complexity_regularization=tree_complexity_regularization, + min_node_weight=min_node_weight, + multiclass_strategy=multiclass_strategy)) + return are_splits_ready, partition_ids, gains, split_infos class SparseSplitHandler(InequalitySplitHandler): @@ -327,9 +363,6 @@ def __init__(self, multiclass_strategy=multiclass_strategy, init_stamp_token=init_stamp_token, name=name) - # Register sparse_make_stats_update function as an Op to the graph. 
- g = ops.get_default_graph() - sparse_make_stats_update.add_to_graph(g) self._sparse_float_column = sparse_float_column def scheduled_reads(self): @@ -361,8 +394,8 @@ def update_stats(self, stamp_token, example_partition_ids, gradients, are_buckets_ready, buckets = scheduled_reads[0] with ops.name_scope(self._name, "SparseSplitHandler"): (quantile_indices, quantile_values, quantile_shapes, quantile_weights, - example_partition_ids, - feature_ids, gradients, hessians) = sparse_make_stats_update( + example_partition_ids, feature_ids, gradients, + hessians) = sparse_make_stats_update( is_active, are_buckets_ready, self._sparse_float_column.indices, self._sparse_float_column.values, self._sparse_float_column.dense_shape, buckets, @@ -379,42 +412,115 @@ def update_stats(self, stamp_token, example_partition_ids, gradients, def make_splits(self, stamp_token, next_stamp_token, class_id): """Create the best split using the accumulated stats and flush the state.""" - # Get the bucket boundaries - are_splits_ready, buckets = ( - self._quantile_accumulator.get_buckets(stamp_token)) - - # After we receive the boundaries from previous iteration we can flush - # the quantile accumulator. - with ops.control_dependencies([buckets]): - flush_quantiles = self._quantile_accumulator.flush( - stamp_token=stamp_token, next_stamp_token=next_stamp_token) - - with ops.device(None): - with ops.device(self._stats_accumulator.resource().device): - num_minibatches, partition_ids, bucket_ids, gradients, hessians = ( - self._stats_accumulator.flush(stamp_token, next_stamp_token)) - - # Put quantile and stats accumulator flushing in the dependency path. - are_splits_ready = control_flow_ops.with_dependencies( - [flush_quantiles, partition_ids], are_splits_ready) - partition_ids, gains, split_infos = ( - split_handler_ops.build_sparse_inequality_splits( - num_minibatches=num_minibatches, - bucket_boundaries=buckets, - partition_ids=partition_ids, - bucket_ids=bucket_ids, - gradients=gradients, - hessians=hessians, - class_id=class_id, - feature_column_group_id=self._feature_column_group_id, - l1_regularization=self._l1_regularization, - l2_regularization=self._l2_regularization, - tree_complexity_regularization=self. 
- _tree_complexity_regularization, - min_node_weight=self._min_node_weight, - bias_feature_id=_BIAS_FEATURE_ID, - multiclass_strategy=self._multiclass_strategy)) - return (are_splits_ready, partition_ids, gains, split_infos) + if (self._gradient_shape == tensor_shape.scalar() and + self._hessian_shape == tensor_shape.scalar()): + handler = make_sparse_split_scalar + else: + handler = make_sparse_split_tensor + + are_splits_ready, partition_ids, gains, split_infos = ( + handler(self._quantile_accumulator.resource(), + self._stats_accumulator.resource(), stamp_token, + next_stamp_token, self._multiclass_strategy, class_id, + self._feature_column_group_id, self._l1_regularization, + self._l2_regularization, self._tree_complexity_regularization, + self._min_node_weight)) + return are_splits_ready, partition_ids, gains, split_infos + + +def _make_sparse_split(quantile_accumulator_handle, stats_accumulator_handle, + stamp_token, next_stamp_token, multiclass_strategy, + class_id, feature_column_id, l1_regularization, + l2_regularization, tree_complexity_regularization, + min_node_weight, is_multi_dimentional): + """Function that builds splits for a sparse feature column.""" + # Get the bucket boundaries + are_splits_ready, buckets = ( + gen_quantile_ops.quantile_accumulator_get_buckets( + quantile_accumulator_handles=[quantile_accumulator_handle], + stamp_token=stamp_token)) + # quantile_accumulator_get_buckets returns a list of results per handle that + # we pass to it. In this case we're getting results just for one resource. + are_splits_ready = are_splits_ready[0] + buckets = buckets[0] + + # After we receive the boundaries from previous iteration we can flush + # the quantile accumulator. + with ops.control_dependencies([buckets]): + flush_quantiles = gen_quantile_ops.quantile_accumulator_flush( + quantile_accumulator_handle=quantile_accumulator_handle, + stamp_token=stamp_token, + next_stamp_token=next_stamp_token) + + if is_multi_dimentional: + num_minibatches, partition_ids, bucket_ids, gradients, hessians = ( + gen_stats_accumulator_ops.stats_accumulator_tensor_flush( + stats_accumulator_handle, stamp_token, next_stamp_token)) + else: + num_minibatches, partition_ids, bucket_ids, gradients, hessians = ( + gen_stats_accumulator_ops.stats_accumulator_scalar_flush( + stats_accumulator_handle, stamp_token, next_stamp_token)) + + # Put quantile and stats accumulator flushing in the dependency path. 
+ with ops.control_dependencies([flush_quantiles, partition_ids]): + are_splits_ready = array_ops.identity(are_splits_ready) + partition_ids, gains, split_infos = ( + split_handler_ops.build_sparse_inequality_splits( + num_minibatches=num_minibatches, + bucket_boundaries=buckets, + partition_ids=partition_ids, + bucket_ids=bucket_ids, + gradients=gradients, + hessians=hessians, + class_id=class_id, + feature_column_group_id=feature_column_id, + l1_regularization=l1_regularization, + l2_regularization=l2_regularization, + tree_complexity_regularization=tree_complexity_regularization, + min_node_weight=min_node_weight, + bias_feature_id=_BIAS_FEATURE_ID, + multiclass_strategy=multiclass_strategy)) + return are_splits_ready, partition_ids, gains, split_infos + + +def _specialize_make_split(func, is_multi_dimentional): + """Builds a specialized version of the function.""" + + @function.Defun( + dtypes.resource, + dtypes.resource, + dtypes.int64, + dtypes.int64, + dtypes.int32, + dtypes.int32, + dtypes.int32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + dtypes.float32, + noinline=True) + def f(quantile_accumulator_handle, stats_accumulator_handle, stamp_token, + next_stamp_token, multiclass_strategy, class_id, feature_column_id, + l1_regularization, l2_regularization, tree_complexity_regularization, + min_node_weight): + """Function that builds splits for a sparse feature column.""" + return func( + quantile_accumulator_handle, stats_accumulator_handle, stamp_token, + next_stamp_token, multiclass_strategy, class_id, feature_column_id, + l1_regularization, l2_regularization, tree_complexity_regularization, + min_node_weight, is_multi_dimentional) + + return f + +make_dense_split_scalar = _specialize_make_split(_make_dense_split, + is_multi_dimentional=False) +make_dense_split_tensor = _specialize_make_split(_make_dense_split, + is_multi_dimentional=True) + +make_sparse_split_scalar = _specialize_make_split(_make_sparse_split, + is_multi_dimentional=False) +make_sparse_split_tensor = _specialize_make_split(_make_sparse_split, + is_multi_dimentional=True) @function.Defun( @@ -501,11 +607,18 @@ def quantiles_ready(): example_partition_ids) # Compute aggregate stats for each partition. + # Since unsorted_segment_sum can be numerically unstable, use 64bit + # operation. + gradients64 = math_ops.cast(gradients, dtypes.float64) + hessians64 = math_ops.cast(hessians, dtypes.float64) per_partition_gradients = math_ops.unsorted_segment_sum( - gradients, mapped_partitions, array_ops.size(unique_partitions)) + gradients64, mapped_partitions, array_ops.size(unique_partitions)) per_partition_hessians = math_ops.unsorted_segment_sum( - hessians, mapped_partitions, array_ops.size(unique_partitions)) - + hessians64, mapped_partitions, array_ops.size(unique_partitions)) + per_partition_gradients = math_ops.cast(per_partition_gradients, + dtypes.float32) + per_partition_hessians = math_ops.cast(per_partition_hessians, + dtypes.float32) # Prepend a bias feature per partition that accumulates the stats for all # examples in that partition. 
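The `_specialize_make_split` helper above bakes the `is_multi_dimentional` flag into two variants of the same builder so that each can be wrapped as a single non-inlined graph function (`noinline=True`). The sketch below shows only the specialization idea, with `functools.partial` standing in for the `@function.Defun` wrapper; the names and return values are illustrative, not the actual generated ops.

```python
import functools

def _make_split(stats_accumulator_handle, is_multi_dimensional):
    # Pick the flush op variant, mirroring the scalar/tensor branch above.
    flush_op = ("stats_accumulator_tensor_flush" if is_multi_dimensional
                else "stats_accumulator_scalar_flush")
    return "%s(%s)" % (flush_op, stats_accumulator_handle)

make_split_scalar = functools.partial(_make_split, is_multi_dimensional=False)
make_split_tensor = functools.partial(_make_split, is_multi_dimensional=True)

print(make_split_scalar("acc"))  # stats_accumulator_scalar_flush(acc)
print(make_split_tensor("acc"))  # stats_accumulator_tensor_flush(acc)
```

In the patch itself the specialized functions are `Defun`-wrapped and take the accumulator resource handles directly, which, together with the colocation change earlier, lets split construction run next to the accumulators with fewer round trips.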
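The per-partition aggregation above now casts gradients and hessians to float64 before `unsorted_segment_sum` and back to float32 afterwards. A minimal NumPy illustration (not the TF op) of the kind of precision loss this avoids when small per-example contributions are added to an already large accumulator:

```python
import numpy as np

big, small = np.float32(1e8), np.float32(1.0)
print((big + small) - big)  # 0.0: the float32 addition drops the small term
print((np.float64(big) + np.float64(small)) - np.float64(big))  # 1.0
```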
bias_feature_ids = array_ops.fill( @@ -533,8 +646,9 @@ def quantiles_not_ready(): empty_float = constant_op.constant([], dtype=dtypes.float32) handler_not_active = (constant_op.constant( - [], dtype=dtypes.int64, shape=[0, 2]), empty_float, constant_op.constant( - [0, 1], dtype=dtypes.int64), empty_float) + [], dtype=dtypes.int64, shape=[0, 2]), empty_float, + constant_op.constant([0, 1], dtype=dtypes.int64), + empty_float) handler_active = (sparse_column_indices, sparse_column_values, sparse_column_shape, weights) quantile_indices, quantile_values, quantile_shape, quantile_weights = ( diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py index 54d03018d9e266..2f2c2302113bf5 100644 --- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py +++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py @@ -18,6 +18,8 @@ from __future__ import division from __future__ import print_function +import numpy as np + from tensorflow.contrib.boosted_trees.lib.learner.batch import ordinal_split_handler from tensorflow.contrib.boosted_trees.proto import learner_pb2 from tensorflow.contrib.boosted_trees.proto import split_info_pb2 @@ -65,9 +67,9 @@ def testGenerateFeatureSplitCandidates(self): hessian_shape = tensor_shape.scalar() split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.1, - l2_regularization=1, - tree_complexity_regularization=0, - min_node_weight=0, + l2_regularization=1., + tree_complexity_regularization=0., + min_node_weight=0., epsilon=0.001, num_quantiles=10, feature_column_group_id=0, @@ -92,7 +94,9 @@ def testGenerateFeatureSplitCandidates(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] + with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( 1, @@ -105,7 +109,7 @@ def testGenerateFeatureSplitCandidates(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -199,10 +203,10 @@ def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self): hessian_shape = tensor_shape.TensorShape([2, 2]) split_handler = ordinal_split_handler.DenseSplitHandler( - l1_regularization=0, - l2_regularization=1, - tree_complexity_regularization=0, - min_node_weight=0, + l1_regularization=0., + l2_regularization=1., + tree_complexity_regularization=0., + min_node_weight=0., epsilon=0.001, num_quantiles=3, feature_column_group_id=0, @@ -227,7 +231,9 @@ def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] + with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( 1, @@ -240,7 +246,7 @@ def 
testGenerateFeatureSplitCandidatesMulticlassFullHessian(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -285,10 +291,10 @@ def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self): hessian_shape = tensor_shape.TensorShape([2]) split_handler = ordinal_split_handler.DenseSplitHandler( - l1_regularization=0, - l2_regularization=1, - tree_complexity_regularization=0, - min_node_weight=0, + l1_regularization=0., + l2_regularization=1., + tree_complexity_regularization=0., + min_node_weight=0., epsilon=0.001, num_quantiles=3, feature_column_group_id=0, @@ -313,7 +319,8 @@ def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( 1, @@ -326,7 +333,7 @@ def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -369,9 +376,9 @@ def testGenerateFeatureSplitCandidatesInactive(self): split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.1, - l2_regularization=1, - tree_complexity_regularization=0, - min_node_weight=0, + l2_regularization=1., + tree_complexity_regularization=0., + min_node_weight=0., epsilon=0.001, num_quantiles=10, feature_column_group_id=0, @@ -396,7 +403,8 @@ def testGenerateFeatureSplitCandidatesInactive(self): example_weights, is_active=array_ops.constant([True, False])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( 1, @@ -409,7 +417,7 @@ def testGenerateFeatureSplitCandidatesInactive(self): is_active=array_ops.constant([False, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -443,9 +451,9 @@ def testGenerateFeatureSplitCandidatesWithTreeComplexity(self): split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.1, - l2_regularization=1, + l2_regularization=1., tree_complexity_regularization=0.5, - min_node_weight=0, + min_node_weight=0., epsilon=0.001, num_quantiles=10, feature_column_group_id=0, @@ -470,7 +478,8 @@ def testGenerateFeatureSplitCandidatesWithTreeComplexity(self): 
example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( 1, @@ -483,7 +492,7 @@ def testGenerateFeatureSplitCandidatesWithTreeComplexity(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -576,7 +585,7 @@ def testGenerateFeatureSplitCandidatesWithMinNodeWeight(self): split_handler = ordinal_split_handler.DenseSplitHandler( l1_regularization=0.1, - l2_regularization=1, + l2_regularization=1., tree_complexity_regularization=0.5, min_node_weight=1.5, epsilon=0.001, @@ -603,7 +612,8 @@ def testGenerateFeatureSplitCandidatesWithMinNodeWeight(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( 1, @@ -616,7 +626,7 @@ def testGenerateFeatureSplitCandidatesWithMinNodeWeight(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -685,10 +695,10 @@ def testGenerateFeatureSplitCandidates(self): class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( - l1_regularization=0, - l2_regularization=2, - tree_complexity_regularization=0, - min_node_weight=0, + l1_regularization=0.0, + l2_regularization=2.0, + tree_complexity_regularization=0.0, + min_node_weight=0.0, epsilon=0.01, num_quantiles=2, feature_column_group_id=0, @@ -713,8 +723,8 @@ def testGenerateFeatureSplitCandidates(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] - + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( 1, @@ -727,7 +737,7 @@ def testGenerateFeatureSplitCandidates(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -811,10 +821,10 @@ def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self): class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( - l1_regularization=0, - l2_regularization=2, - tree_complexity_regularization=0, - 
min_node_weight=0, + l1_regularization=0.0, + l2_regularization=2.0, + tree_complexity_regularization=0.0, + min_node_weight=0.0, epsilon=0.01, num_quantiles=2, feature_column_group_id=0, @@ -839,7 +849,8 @@ def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( @@ -853,7 +864,7 @@ def testGenerateFeatureSplitCandidatesMulticlassFullHessian(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -905,10 +916,10 @@ def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self): class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( - l1_regularization=0, - l2_regularization=2, - tree_complexity_regularization=0, - min_node_weight=0, + l1_regularization=0.0, + l2_regularization=2.0, + tree_complexity_regularization=0.0, + min_node_weight=0.0, epsilon=0.01, num_quantiles=2, feature_column_group_id=0, @@ -933,7 +944,8 @@ def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( @@ -947,7 +959,7 @@ def testGenerateFeatureSplitCandidatesMulticlassDiagonalHessian(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -996,10 +1008,10 @@ def testGenerateFeatureSplitCandidatesInactive(self): class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( - l1_regularization=0, - l2_regularization=2, - tree_complexity_regularization=0, - min_node_weight=0, + l1_regularization=0.0, + l2_regularization=2.0, + tree_complexity_regularization=0.0, + min_node_weight=0.0, epsilon=0.01, num_quantiles=2, feature_column_group_id=0, @@ -1024,7 +1036,8 @@ def testGenerateFeatureSplitCandidatesInactive(self): example_weights, is_active=array_ops.constant([True, False])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( @@ -1038,7 +1051,7 @@ def testGenerateFeatureSplitCandidatesInactive(self): is_active=array_ops.constant([False, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - 
split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -1065,10 +1078,10 @@ def testEmpty(self): class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( - l1_regularization=0, - l2_regularization=2, - tree_complexity_regularization=0, - min_node_weight=0, + l1_regularization=0.0, + l2_regularization=2.0, + tree_complexity_regularization=0.0, + min_node_weight=0.0, epsilon=0.01, num_quantiles=2, feature_column_group_id=0, @@ -1096,7 +1109,8 @@ def testEmpty(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( @@ -1110,7 +1124,7 @@ def testEmpty(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits @@ -1138,10 +1152,10 @@ def testDegenerativeCase(self): class_id = -1 split_handler = ordinal_split_handler.SparseSplitHandler( - l1_regularization=0, - l2_regularization=2, - tree_complexity_regularization=0, - min_node_weight=0, + l1_regularization=0.0, + l2_regularization=2.0, + tree_complexity_regularization=0.0, + min_node_weight=0.0, epsilon=0.01, num_quantiles=2, feature_column_group_id=0, @@ -1166,7 +1180,8 @@ def testDegenerativeCase(self): example_weights, is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_1]): - are_splits_ready = split_handler.make_splits(0, 1, class_id)[0] + are_splits_ready = split_handler.make_splits( + np.int64(0), np.int64(1), class_id)[0] with ops.control_dependencies([are_splits_ready]): update_2 = split_handler.update_stats_sync( @@ -1180,7 +1195,7 @@ def testDegenerativeCase(self): is_active=array_ops.constant([True, True])) with ops.control_dependencies([update_2]): are_splits_ready2, partitions, gains, splits = ( - split_handler.make_splits(1, 2, class_id)) + split_handler.make_splits(np.int64(1), np.int64(2), class_id)) are_splits_ready, are_splits_ready2, partitions, gains, splits = ( sess.run([ are_splits_ready, are_splits_ready2, partitions, gains, splits diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h index 8ad97fedc923ac..c120dd8a6c156e 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h @@ -295,7 +295,7 @@ WeightedQuantilesStream::GetQuantileSpecs( if (eps <= std::numeric_limits::epsilon()) { // Exact quantile computation at the expense of RAM. 
max_level = 1; - block_size = std::max(max_elements, 2LL); + block_size = std::max(max_elements, int64{2}); } else { // The bottom-most level will become full at most // (max_elements / block_size) times, the level above will become full @@ -315,7 +315,7 @@ WeightedQuantilesStream::GetQuantileSpecs( block_size = static_cast(ceil(max_level / eps)) + 1; } } - return std::make_tuple(max_level, std::max(block_size, 2LL)); + return std::make_tuple(max_level, std::max(block_size, int64{2})); } } // namespace quantiles diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h index 7576856dc3a6d0..a7e7bfc13cadce 100644 --- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h +++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h @@ -195,7 +195,7 @@ class WeightedQuantilesSummary { // designed to be cache-friendly. void Compress(int64 size_hint, double min_eps = 0) { // No-op if we're already within the size requirement. - size_hint = std::max(size_hint, 2LL); + size_hint = std::max(size_hint, int64{2}); if (entries_.size() <= size_hint) { return; } @@ -267,7 +267,7 @@ class WeightedQuantilesSummary { if (entries_.empty()) { return output; } - num_quantiles = std::max(num_quantiles, 2LL); + num_quantiles = std::max(num_quantiles, int64{2}); output.reserve(num_quantiles + 1); // Make successive rank queries to get boundaries. diff --git a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc index 5d0ebbf73ce127..ca5c7f3d8c78a5 100644 --- a/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc +++ b/tensorflow/contrib/boosted_trees/ops/split_handler_ops.cc @@ -23,12 +23,6 @@ using shape_inference::InferenceContext; using shape_inference::ShapeHandle; REGISTER_OP("BuildDenseInequalitySplits") - .Attr("feature_column_group_id: int") - .Attr("l1_regularization: float") - .Attr("l2_regularization: float") - .Attr("tree_complexity_regularization: float") - .Attr("min_node_weight: float") - .Attr("multiclass_strategy: int") .Input("num_minibatches: int64") .Input("partition_ids: int32") .Input("bucket_ids: int64") @@ -36,6 +30,12 @@ REGISTER_OP("BuildDenseInequalitySplits") .Input("hessians: float32") .Input("bucket_boundaries: float32") .Input("class_id: int32") + .Input("feature_column_group_id: int32") + .Input("l1_regularization: float") + .Input("l2_regularization: float") + .Input("tree_complexity_regularization: float") + .Input("min_node_weight: float") + .Input("multiclass_strategy: int32") .Output("output_partition_ids: int32") .Output("gains: float32") .Output("split_infos: string") @@ -73,6 +73,17 @@ bucket_ids: A rank 2 tensor of buckets IDs and dimensions. gradients: A rank 1 tensor of gradients. hessians: A rank 1 tensor of hessians. bucket_boundaries: A rank 1 tensor, thresholds that were used for bucketization. +class_id: A scalar, the class id for which we're building the splits. +feature_column_group_id: A scalar, the index of the feature we are spiltting on. +l1_regularization: A scalar, which specifies the l1 regularization term. +l2_regularization: A scalar, which specifies the l2 regularization term. +tree_complexity_regularization: A scalar, which specifies the tree complexity + regularization term. +min_node_weight: A scalar, minimum sum of example hessian needed in a child. 
+ If a split results in a leaf node with a smaller value, the split will not + be considered. +multiclass_strategy: A scalar, specifying the multiclass handling strategy. + See LearnerConfig.MultiClassStrategy for valid values. output_partition_ids: A rank 1 tensor, the partition IDs that we created splits for. gains: A rank 1 tensor, for the computed gain for the created splits. @@ -81,13 +92,6 @@ split_infos: A rank 1 tensor of serialized protos which contains the )doc"); REGISTER_OP("BuildSparseInequalitySplits") - .Attr("feature_column_group_id: int") - .Attr("bias_feature_id: int") - .Attr("l1_regularization: float") - .Attr("l2_regularization: float") - .Attr("tree_complexity_regularization: float") - .Attr("min_node_weight: float") - .Attr("multiclass_strategy: int") .Input("num_minibatches: int64") .Input("partition_ids: int32") .Input("bucket_ids: int64") @@ -95,6 +99,13 @@ REGISTER_OP("BuildSparseInequalitySplits") .Input("hessians: float32") .Input("bucket_boundaries: float32") .Input("class_id: int32") + .Input("feature_column_group_id: int32") + .Input("bias_feature_id: int64") + .Input("l1_regularization: float") + .Input("l2_regularization: float") + .Input("tree_complexity_regularization: float") + .Input("min_node_weight: float") + .Input("multiclass_strategy: int32") .Output("output_partition_ids: int32") .Output("gains: float32") .Output("split_infos: string") @@ -133,6 +144,17 @@ bucket_ids: A rank 2 tensor of buckets IDs and dimensions. gradients: A rank 1 tensor of gradients. hessians: A rank 1 tensor of hessians. bucket_boundaries: A rank 1 tensor, thresholds that were used for bucketization. +class_id: A scalar, the class id for which we're building the splits. +feature_column_group_id: A scalar, the index of the feature we are spiltting on. +l1_regularization: A scalar, which specifies the l1 regularization term. +l2_regularization: A scalar, which specifies the l2 regularization term. +tree_complexity_regularization: A scalar, which specifies the tree complexity + regularization term. +min_node_weight: A scalar, minimum sum of example hessian needed in a child. + If a split results in a leaf node with a smaller value, the split will not + be considered. +multiclass_strategy: A scalar, specifying the multiclass handling strategy. + See LearnerConfig.MultiClassStrategy for valid values. output_partition_ids: A rank 1 tensor, the partition IDs that we created splits for. gains: A rank 1 tensor, for the computed gain for the created splits. @@ -141,19 +163,19 @@ split_infos: A rank 1 tensor of serialized protos which contains the )doc"); REGISTER_OP("BuildCategoricalEqualitySplits") - .Attr("feature_column_group_id: int") - .Attr("bias_feature_id: int") - .Attr("l1_regularization: float") - .Attr("l2_regularization: float") - .Attr("tree_complexity_regularization: float") - .Attr("min_node_weight: float") - .Attr("multiclass_strategy: int") .Input("num_minibatches: int64") .Input("partition_ids: int32") .Input("feature_ids: int64") .Input("gradients: float32") .Input("hessians: float32") .Input("class_id: int32") + .Input("feature_column_group_id: int32") + .Input("bias_feature_id: int64") + .Input("l1_regularization: float") + .Input("l2_regularization: float") + .Input("tree_complexity_regularization: float") + .Input("min_node_weight: float") + .Input("multiclass_strategy: int32") .Output("output_partition_ids: int32") .Output("gains: float32") .Output("split_infos: string") @@ -188,6 +210,17 @@ partition_ids: A rank 1 tensor of partition IDs. 
feature_ids: A rank 2 tensor of feature IDs and dimensions. gradients: A rank 1 tensor of gradients. hessians: A rank 1 tensor of hessians. +class_id: A scalar, the class id for which we're building the splits. +feature_column_group_id: A scalar, the index of the feature we are spiltting on. +l1_regularization: A scalar, which specifies the l1 regularization term. +l2_regularization: A scalar, which specifies the l2 regularization term. +tree_complexity_regularization: A scalar, which specifies the tree complexity + regularization term. +min_node_weight: A scalar, minimum sum of example hessian needed in a child. + If a split results in a leaf node with a smaller value, the split will not + be considered. +multiclass_strategy: A scalar, specifying the multiclass handling strategy. + See LearnerConfig.MultiClassStrategy for valid values. output_partition_ids: A rank 1 tensor, the partition IDs that we created splits for. gains: A rank 1 tensor, for the computed gain for the created splits. @@ -196,4 +229,3 @@ split_infos: A rank 1 tensor of serialized protos which contains the )doc"); } // namespace tensorflow - // namespace tensorflow diff --git a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py index 28834ef55bf8e1..5cd37ec67ec3bd 100644 --- a/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py +++ b/tensorflow/contrib/boosted_trees/python/kernel_tests/split_handler_ops_test.py @@ -18,6 +18,8 @@ from __future__ import division from __future__ import print_function +import random + from tensorflow.contrib.boosted_trees.proto import learner_pb2 from tensorflow.contrib.boosted_trees.proto import split_info_pb2 from tensorflow.contrib.boosted_trees.python.ops import split_handler_ops @@ -399,6 +401,65 @@ def testMakeSparseMultidimensionalSplit(self): self.assertAllClose(0.6, split_node.split.threshold) + def testMakeSparseSplitDefaultDirectionIsStable(self): + """Tests default direction is stable when no sparsity.""" + random.seed(1123) + for _ in range(50): + with self.test_session() as sess: + grad = random.random() + hessian = random.random() + # The data looks like the following (divide by the num of steps 2). + # Gradients | Partition | bucket ID | + # (grad, hessian) | 0 | -1 | + # And then 100 buckets of + # (grad/100, hessian/100), so there is no sparsity. + n_buckets = 100 + + # 1 for the overall sum, and 100 buckets. + partition_ids = array_ops.constant( + [0] * (n_buckets + 1), dtype=dtypes.int32) + # We have only 1 dimension in our sparse feature column. 
+ + bucket_ids = [-1] + [n for n in range(100)] + bucket_ids = array_ops.constant(bucket_ids, dtype=dtypes.int64) + dimension_ids = array_ops.constant( + [0] * (n_buckets + 1), dtype=dtypes.int64) + bucket_ids = array_ops.stack([bucket_ids, dimension_ids], axis=1) + + gradients = [grad] + [grad / n_buckets] * n_buckets + gradients = array_ops.constant(gradients) + hessians = [hessian] + [hessian / n_buckets] * n_buckets + hessians = array_ops.constant(hessians) + + boundaries = [x * 1 for x in range(n_buckets + 1)] + bucket_boundaries = array_ops.constant(boundaries, dtype=dtypes.float32) + + partitions, gains, splits = ( + split_handler_ops.build_sparse_inequality_splits( + num_minibatches=2, + partition_ids=partition_ids, + bucket_ids=bucket_ids, + gradients=gradients, + hessians=hessians, + bucket_boundaries=bucket_boundaries, + l1_regularization=0, + l2_regularization=2, + tree_complexity_regularization=0, + min_node_weight=0, + feature_column_group_id=0, + bias_feature_id=-1, + class_id=-1, + multiclass_strategy=learner_pb2.LearnerConfig.TREE_PER_CLASS)) + partitions, gains, splits = (sess.run([partitions, gains, splits])) + self.assertAllEqual([0], partitions) + self.assertEqual(1, len(splits)) + + split_info = split_info_pb2.SplitInfo() + split_info.ParseFromString(splits[0]) + self.assertTrue( + split_info.split_node.HasField( + 'sparse_float_binary_split_default_left')) + def testMakeMulticlassSparseSplit(self): """Tests split handler op.""" with self.test_session() as sess: diff --git a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py index 7a5f329b7ab321..843420968ac6a6 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py +++ b/tensorflow/contrib/boosted_trees/python/ops/batch_ops_utils.py @@ -20,6 +20,8 @@ import abc import collections +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops @@ -60,6 +62,7 @@ def _move_tensors(tensors, device): """Moves a list of tensors to a device by concatenating/splitting them.""" # Reset the device setting to avoid weird interactions with device merging # logic. + zero = constant_op.constant(0, dtype=dtypes.int32) with ops.device(None): if all(tensor.shape == tensor_shape.scalar() for tensor in tensors): with ops.device(tensors[0].device): @@ -68,12 +71,11 @@ def _move_tensors(tensors, device): return array_ops.unstack(values) else: with ops.device(tensors[0].device): - sizes = array_ops.stack( - [array_ops.shape(tensor)[0] for tensor in tensors]) - values = array_ops.concat(tensors, axis=0) + sizes = array_ops.stack(array_ops.shape_n(tensors))[:, 0] + values = array_ops.concat(tensors, axis=zero) with ops.device(device): sizes = array_ops.unstack(sizes) - return list(array_ops.split(values, sizes, axis=0)) + return list(array_ops.split(values, sizes, axis=zero)) def _scheduled_stamp_resource_op_runner(batch, stamp): diff --git a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py index 1b184d296b329c..19b6b3296db394 100644 --- a/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py +++ b/tensorflow/contrib/boosted_trees/python/ops/quantile_ops.py @@ -187,7 +187,7 @@ def flush(self, stamp_token, next_stamp_token): stamp_token: Expected current token. 
next_stamp_token: Next value for the token. Returns: - A list of quantiles or approximate boundaries. + The flush operation. """ return gen_quantile_ops.quantile_accumulator_flush( quantile_accumulator_handle=self._quantile_accumulator_handle, @@ -201,3 +201,6 @@ def flush_summary(self, stamp_token, next_stamp_token): stamp_token=stamp_token, next_stamp_token=next_stamp_token) return result + + def resource(self): + return self._quantile_accumulator_handle diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py index 08c1dcdd028829..5dd2e0c7f254f3 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py @@ -180,8 +180,7 @@ def extract_features(features, feature_columns, use_core_columns): elif isinstance(fc, feature_column_lib._EmbeddingColumn): # pylint: enable=protected-access transformed_features[fc.name] = fc_core.input_layer( - features, [fc], - weight_collections=[scope]) + features, [fc], weight_collections=[scope]) else: result = feature_column_ops.transform_features(features, [fc]) if len(result) > 1: @@ -334,10 +333,12 @@ def __init__(self, self._feature_columns = feature_columns self._learner_config_serialized = learner_config.SerializeToString() self._attempted_trees = variables.Variable( - initial_value=array_ops.zeros([], dtypes.int64), trainable=False, + initial_value=array_ops.zeros([], dtypes.int64), + trainable=False, name="attempted_trees") self._finalized_trees = variables.Variable( - initial_value=array_ops.zeros([], dtypes.int64), trainable=False, + initial_value=array_ops.zeros([], dtypes.int64), + trainable=False, name="finalized_trees") if not features: raise ValueError("Features dictionary must be specified.") @@ -354,9 +355,10 @@ def __init__(self, self._sparse_int_indices = sparse_int_indices self._sparse_int_values = sparse_int_values self._sparse_int_shapes = sparse_int_shapes - self._reduce_dim = (self._learner_config.multi_class_strategy == - learner_pb2.LearnerConfig.TREE_PER_CLASS and - learner_config.num_classes == 2) + self._reduce_dim = ( + self._learner_config.multi_class_strategy == + learner_pb2.LearnerConfig.TREE_PER_CLASS and + learner_config.num_classes == 2) def _predict_and_return_dict(self, ensemble_handle, ensemble_stamp, mode): """Runs prediction and returns a dictionary of the prediction results. @@ -369,13 +371,13 @@ def _predict_and_return_dict(self, ensemble_handle, ensemble_stamp, mode): Returns: a dictionary of prediction results - ENSEMBLE_STAMP, PREDICTION, PARTITION_IDS, - NUM_LAYER_ATTEMPTED, NUM_TREES_ATTEMPED. + NUM_LAYER_ATTEMPTED, NUM_TREES_ATTEMPTED. """ ensemble_stats = training_ops.tree_ensemble_stats(ensemble_handle, ensemble_stamp) num_handlers = ( - len(self._dense_floats) + len(self._sparse_float_shapes) + - len(self._sparse_int_shapes)) + len(self._dense_floats) + len(self._sparse_float_shapes) + len( + self._sparse_int_shapes)) # Used during feature selection. used_handlers = model_ops.tree_ensemble_used_handlers( ensemble_handle, ensemble_stamp, num_all_handlers=num_handlers) @@ -432,8 +434,9 @@ def predict(self, mode): # Use the current ensemble to predict on the current batch of input. # For faster prediction we check if the inputs are on the same device # as the model. If not, we create a copy of the model on the worker. 
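The comment above is about keeping prediction on the same device as its inputs. A tiny, hypothetical illustration of the colocation idea only, not the model-copying logic itself (the `features` tensor and the matmul are stand-ins):

```python
import tensorflow as tf

features = tf.constant([[1.0], [2.0]])

# Place the downstream op on whatever device produced the input, so the
# prediction path does not pay for a cross-device copy of the data.
with tf.device(features.device):
  logits = tf.matmul(features, tf.ones([1, 1]))
```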
- input_deps = (self._dense_floats + self._sparse_float_indices + - self._sparse_int_indices) + input_deps = ( + self._dense_floats + self._sparse_float_indices + + self._sparse_int_indices) if not input_deps: raise ValueError("No input tensors for prediction.") @@ -457,8 +460,8 @@ def predict(self, mode): # Determine whether the local ensemble is stale and update it if needed. def _refresh_local_ensemble_fn(): - # Serialize the model from parameter server after reading all inputs. - with ops.control_dependencies(input_deps): + # Serialize the model from parameter server after reading the inputs. + with ops.control_dependencies([input_deps[0]]): (ensemble_stamp, serialized_model) = ( model_ops.tree_ensemble_serialize(self._ensemble_handle)) @@ -500,8 +503,9 @@ def train(self, loss, predictions_dict, labels): ValueError: if inputs are not valid. """ # Get the worker device from input dependencies. - input_deps = (self._dense_floats + self._sparse_float_indices + - self._sparse_int_indices) + input_deps = ( + self._dense_floats + self._sparse_float_indices + + self._sparse_int_indices) worker_device = input_deps[0].device # Get tensors relevant for training and form the loss. @@ -517,7 +521,7 @@ def train(self, loss, predictions_dict, labels): aggregation_method=None)[0] strategy = self._learner_config.multi_class_strategy - class_id = -1 + class_id = constant_op.constant(-1, dtype=dtypes.int32) # Handle different multiclass strategies. if strategy == learner_pb2.LearnerConfig.TREE_PER_CLASS: # We build one vs rest trees. @@ -571,31 +575,39 @@ def train(self, loss, predictions_dict, labels): # Get the weights for each example for quantiles calculation, weights = self._get_weights(hessian_shape, squeezed_hessians) - regularization_config = self._learner_config.regularization - min_node_weight = self._learner_config.constraints.min_node_weight # Create all handlers ensuring resources are evenly allocated across PS. 
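The explicit constants built just below (float32 regularizers, an int32 strategy, and the int64 stamp tokens passed by callers) are needed because the specialized split builders are `function.Defun`s, which declare their input dtypes up front and do not coerce mismatched arguments. A small sketch of that constraint with a made-up function, assuming the same TF 1.x `function.Defun` decorator used elsewhere in this change:

```python
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import function


@function.Defun(tf.int64, tf.float32, noinline=True)
def _toy_split_builder(stamp_token, l2_regularization):
  # Stand-in body; the real builders call the split-handler ops.
  return tf.cast(stamp_token, tf.float32) + l2_regularization

# A plain Python `1` would typically be converted to an int32 tensor and
# rejected by the dtype check, which is why the tests pass np.int64 stamp
# tokens and the handlers build explicit float32/int32 constants.
out = _toy_split_builder(np.int64(1), np.float32(0.5))

with tf.Session() as sess:
  print(sess.run(out))  # 1.5
```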
fc_name_idx = 0 handlers = [] init_stamp_token = constant_op.constant(0, dtype=dtypes.int64) + l1_regularization = constant_op.constant( + self._learner_config.regularization.l1, dtypes.float32) + l2_regularization = constant_op.constant( + self._learner_config.regularization.l2, dtypes.float32) + tree_complexity_regularization = constant_op.constant( + self._learner_config.regularization.tree_complexity, dtypes.float32) + min_node_weight = constant_op.constant( + self._learner_config.constraints.min_node_weight, dtypes.float32) + epsilon = 0.01 + num_quantiles = 100 + strategy_tensor = constant_op.constant(strategy) with ops.device(self._get_replica_device_setter(worker_device)): # Create handlers for dense float columns for dense_float_column_idx in range(len(self._dense_floats)): fc_name = self._fc_names[fc_name_idx] handlers.append( ordinal_split_handler.DenseSplitHandler( - l1_regularization=regularization_config.l1, - l2_regularization=regularization_config.l2, - tree_complexity_regularization=( - regularization_config.tree_complexity), + l1_regularization=l1_regularization, + l2_regularization=l2_regularization, + tree_complexity_regularization=tree_complexity_regularization, min_node_weight=min_node_weight, feature_column_group_id=dense_float_column_idx, - epsilon=0.01, - num_quantiles=100, + epsilon=epsilon, + num_quantiles=num_quantiles, dense_float_column=self._dense_floats[dense_float_column_idx], name=fc_name, gradient_shape=gradient_shape, hessian_shape=hessian_shape, - multiclass_strategy=strategy, + multiclass_strategy=strategy_tensor, init_stamp_token=init_stamp_token)) fc_name_idx += 1 @@ -604,14 +616,13 @@ def train(self, loss, predictions_dict, labels): fc_name = self._fc_names[fc_name_idx] handlers.append( ordinal_split_handler.SparseSplitHandler( - l1_regularization=regularization_config.l1, - l2_regularization=regularization_config.l2, - tree_complexity_regularization=( - regularization_config.tree_complexity), + l1_regularization=l1_regularization, + l2_regularization=l2_regularization, + tree_complexity_regularization=tree_complexity_regularization, min_node_weight=min_node_weight, feature_column_group_id=sparse_float_column_idx, - epsilon=0.01, - num_quantiles=100, + epsilon=epsilon, + num_quantiles=num_quantiles, sparse_float_column=sparse_tensor.SparseTensor( self._sparse_float_indices[sparse_float_column_idx], self._sparse_float_values[sparse_float_column_idx], @@ -619,7 +630,7 @@ def train(self, loss, predictions_dict, labels): name=fc_name, gradient_shape=gradient_shape, hessian_shape=hessian_shape, - multiclass_strategy=strategy, + multiclass_strategy=strategy_tensor, init_stamp_token=init_stamp_token)) fc_name_idx += 1 @@ -628,10 +639,9 @@ def train(self, loss, predictions_dict, labels): fc_name = self._fc_names[fc_name_idx] handlers.append( categorical_split_handler.EqualitySplitHandler( - l1_regularization=regularization_config.l1, - l2_regularization=regularization_config.l2, - tree_complexity_regularization=( - regularization_config.tree_complexity), + l1_regularization=l1_regularization, + l2_regularization=l2_regularization, + tree_complexity_regularization=tree_complexity_regularization, min_node_weight=min_node_weight, feature_column_group_id=sparse_int_column_idx, sparse_int_column=sparse_tensor.SparseTensor( @@ -641,7 +651,7 @@ def train(self, loss, predictions_dict, labels): name=fc_name, gradient_shape=gradient_shape, hessian_shape=hessian_shape, - multiclass_strategy=strategy, + multiclass_strategy=strategy_tensor, 
init_stamp_token=init_stamp_token)) fc_name_idx += 1 @@ -694,11 +704,11 @@ def train(self, loss, predictions_dict, labels): name="continue_centering", trainable=False) stats_update_ops.append( - control_flow_ops.cond(continue_centering, - self._make_update_bias_stats_fn( - ensemble_stamp, predictions, gradients, - bias_stats_accumulator), - control_flow_ops.no_op)) + control_flow_ops.cond( + continue_centering, + self._make_update_bias_stats_fn(ensemble_stamp, predictions, + gradients, bias_stats_accumulator), + control_flow_ops.no_op)) # Update handler stats. handler_reads = collections.OrderedDict() @@ -720,8 +730,8 @@ def train(self, loss, predictions_dict, labels): shape=[len(handlers)], seed=[seed + 1, 1]) active_handlers = array_ops.stack( [active_handlers_current_layer, active_handlers_next_layer], axis=1) - active_handlers = (active_handlers < - self._learner_config.feature_fraction_per_level) + active_handlers = ( + active_handlers < self._learner_config.feature_fraction_per_level) elif subsampling_type == "feature_fraction_per_tree": seed = predictions_dict[NUM_TREES_ATTEMPTED] active_handlers_current_layer = stateless.stateless_random_uniform( @@ -729,9 +739,12 @@ def train(self, loss, predictions_dict, labels): active_handlers_current_layer = ( active_handlers_current_layer < self._learner_config.feature_fraction_per_tree) - active_handlers = array_ops.stack([ - active_handlers_current_layer, - array_ops.ones([len(handlers)], dtype=dtypes.bool)], axis=1) + active_handlers = array_ops.stack( + [ + active_handlers_current_layer, + array_ops.ones([len(handlers)], dtype=dtypes.bool) + ], + axis=1) else: active_handlers = array_ops.ones([len(handlers), 2], dtype=dtypes.bool) @@ -760,6 +773,7 @@ def _feature_selection_active_handlers(): empty_hessians = constant_op.constant( [], dtype=dtypes.float32, shape=empty_hess_shape) + active_handlers = array_ops.unstack(active_handlers, axis=0) for handler_idx in range(len(handlers)): handler = handlers[handler_idx] is_active = active_handlers[handler_idx] @@ -901,7 +915,6 @@ def _get_replica_device_setter(self, worker_device): "DecisionTreeEnsembleResourceHandleOp", "StatsAccumulatorScalarResourceHandleOp", "StatsAccumulatorTensorResourceHandleOp", - "QuantileStreamResourceHandleOp", ] ps_strategy = _OpRoundRobinStrategy(ps_ops, ps_tasks) return device_setter.replica_device_setter( @@ -971,7 +984,7 @@ def _update_ensemble(): # This is a workaround for the slowness of graph building in tf.cond. # See (b/36554864). split_sizes = array_ops.reshape( - array_ops.shape_n(partition_ids_list), [-1]) + array_ops.shape_n(partition_ids_list), [len(partition_ids_list)]) partition_ids = array_ops.concat(partition_ids_list, axis=0) gains = array_ops.concat(gains_list, axis=0) split_infos = array_ops.concat(split_info_list, axis=0) @@ -1036,8 +1049,11 @@ def _grow_ensemble_fn(): # Update ensemble. update_ops = [are_all_splits_ready] - update_model = control_flow_ops.cond(continue_centering, _center_bias_fn, - _grow_ensemble_fn) + if self._center_bias: + update_model = control_flow_ops.cond(continue_centering, + _center_bias_fn, _grow_ensemble_fn) + else: + update_model = _grow_ensemble_fn() update_ops.append(update_model) # Update ensemble stats. 
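The last hunk above replaces an unconditional `control_flow_ops.cond` with a Python-level branch on `self._center_bias`: when bias centering is disabled at construction time, there is no need to build the centering branch at all, which avoids the graph-building cost that `tf.cond` pays for both branches. A schematic version of that pattern with stand-in branch functions (values and names are illustrative):

```python
import tensorflow as tf

center_bias = False  # known when the graph is built

continue_centering = tf.Variable(True, trainable=False)

def _center_bias_fn():
  return tf.constant(1)

def _grow_ensemble_fn():
  return tf.constant(2)

if center_bias:
  # Runtime decision: both branches are added to the graph, one runs per step.
  update_model = tf.cond(continue_centering, _center_bias_fn, _grow_ensemble_fn)
else:
  # Build-time decision: only the tree-growing branch is ever constructed.
  update_model = _grow_ensemble_fn()
```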
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py index f9c22283b7f513..289fb195db109f 100644 --- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py +++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py @@ -31,7 +31,6 @@ from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib from tensorflow.contrib.learn.python.learn.estimators import model_fn - from tensorflow.python.framework import dtypes from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util @@ -97,8 +96,8 @@ def testExtractFeatures(self): array_ops.zeros([2], dtypes.int64)) features["sparse_int"] = sparse_tensor.SparseTensor( array_ops.zeros([2, 2], dtypes.int64), - array_ops.zeros([2], dtypes.int64), - array_ops.zeros([2], dtypes.int64)) + array_ops.zeros([2], dtypes.int64), array_ops.zeros([2], + dtypes.int64)) (fc_names, dense_floats, sparse_float_indices, sparse_float_values, sparse_float_shapes, sparse_int_indices, sparse_int_values, sparse_int_shapes) = ( @@ -139,8 +138,8 @@ def testExtractFeaturesWithTransformation(self): array_ops.zeros([2], dtypes.int64)) features["sparse_categorical"] = sparse_tensor.SparseTensor( array_ops.zeros([2, 2], dtypes.int64), - array_ops.zeros( - [2], dtypes.string), array_ops.zeros([2], dtypes.int64)) + array_ops.zeros([2], dtypes.string), array_ops.zeros([2], + dtypes.int64)) feature_columns = set() feature_columns.add(layers.real_valued_column("dense_float")) feature_columns.add( @@ -235,7 +234,8 @@ def testTrainFnChiefNoBiasCentering(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=1, features=features) + logits_dimension=1, + features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -316,6 +316,113 @@ def testTrainFnChiefNoBiasCentering(self): }""" self.assertProtoEquals(expected_tree, output.trees[0]) + def testTrainFnChiefSparseAndDense(self): + """Tests the train function with sparse and dense features.""" + with self.test_session() as sess: + ensemble_handle = model_ops.tree_ensemble_variable( + stamp_token=0, tree_ensemble_config="", name="tree_ensemble") + learner_config = learner_pb2.LearnerConfig() + learner_config.learning_rate_tuner.fixed.learning_rate = 0.1 + learner_config.num_classes = 2 + learner_config.regularization.l1 = 0 + learner_config.regularization.l2 = 0 + learner_config.constraints.max_tree_depth = 1 + learner_config.constraints.min_node_weight = 0 + features = {} + features["dense_float"] = array_ops.ones([4, 1], dtypes.float32) + features["sparse_float"] = sparse_tensor.SparseTensor( + array_ops.zeros([2, 2], dtypes.int64), + array_ops.zeros([2], dtypes.float32), + array_ops.constant([4, 1], dtypes.int64)) + + gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel( + is_chief=True, + num_ps_replicas=0, + center_bias=False, + ensemble_handle=ensemble_handle, + examples_per_layer=1, + learner_config=learner_config, + logits_dimension=1, + features=features) + + predictions = array_ops.constant( + [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) + partition_ids = array_ops.zeros([4], dtypes.int32) + ensemble_stamp = variables.Variable( + initial_value=0, + name="ensemble_stamp", + trainable=False, + dtype=dtypes.int64) + + predictions_dict = { + "predictions": predictions, + 
"predictions_no_dropout": predictions, + "partition_ids": partition_ids, + "ensemble_stamp": ensemble_stamp, + "num_trees": 12, + } + + labels = array_ops.ones([4, 1], dtypes.float32) + weights = array_ops.ones([4, 1], dtypes.float32) + # Create train op. + train_op = gbdt_model.train( + loss=math_ops.reduce_mean( + _squared_loss(labels, weights, predictions)), + predictions_dict=predictions_dict, + labels=labels) + variables.global_variables_initializer().run() + resources.initialize_resources(resources.shared_resources()).run() + + # On first run, expect no splits to be chosen because the quantile + # buckets will not be ready. + train_op.run() + stamp_token, serialized = model_ops.tree_ensemble_serialize( + ensemble_handle) + output = tree_config_pb2.DecisionTreeEnsembleConfig() + output.ParseFromString(serialized.eval()) + self.assertEquals(len(output.trees), 0) + self.assertEquals(len(output.tree_weights), 0) + self.assertEquals(stamp_token.eval(), 1) + + # Update the stamp to be able to run a second time. + sess.run([ensemble_stamp.assign_add(1)]) + + train_op.run() + stamp_token, serialized = model_ops.tree_ensemble_serialize( + ensemble_handle) + output = tree_config_pb2.DecisionTreeEnsembleConfig() + output.ParseFromString(serialized.eval()) + self.assertEquals(len(output.trees), 1) + self.assertAllClose(output.tree_weights, [0.1]) + self.assertEquals(stamp_token.eval(), 2) + expected_tree = """ + nodes { + sparse_float_binary_split_default_right { + split{ + left_id: 1 + right_id: 2 + } + } + node_metadata { + gain: 1.125 + } + } + nodes { + leaf { + vector { + value: 1.0 + } + } + } + nodes { + leaf { + vector { + value: -0.5 + } + } + }""" + self.assertProtoEquals(expected_tree, output.trees[0]) + def testTrainFnChiefScalingNumberOfExamples(self): """Tests the train function running on chief without bias centering.""" with self.test_session() as sess: @@ -339,7 +446,8 @@ def testTrainFnChiefScalingNumberOfExamples(self): ensemble_handle=ensemble_handle, examples_per_layer=num_examples_fn, learner_config=learner_config, - logits_dimension=1, features=features) + logits_dimension=1, + features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -442,7 +550,8 @@ def testTrainFnChiefWithBiasCentering(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=1, features=features) + logits_dimension=1, + features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -513,7 +622,8 @@ def testTrainFnNonChiefNoBiasCentering(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=1, features=features) + logits_dimension=1, + features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -576,7 +686,8 @@ def testTrainFnNonChiefWithCentering(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=1, features=features) + logits_dimension=1, + features=features) predictions = array_ops.constant( [[0.0], [1.0], [0.0], [2.0]], dtype=dtypes.float32) @@ -622,7 +733,8 @@ def testPredictFn(self): with self.test_session() as sess: # Create ensemble with one bias node. 
ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig() - text_format.Merge(""" + text_format.Merge( + """ trees { nodes { leaf { @@ -659,14 +771,15 @@ def testPredictFn(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=1, features=features) + logits_dimension=1, + features=features) # Create predict op. mode = model_fn.ModeKeys.EVAL predictions_dict = sess.run(gbdt_model.predict(mode)) self.assertEquals(predictions_dict["ensemble_stamp"], 3) - self.assertAllClose(predictions_dict["predictions"], [[0.25], [0.25], - [0.25], [0.25]]) + self.assertAllClose(predictions_dict["predictions"], + [[0.25], [0.25], [0.25], [0.25]]) self.assertAllClose(predictions_dict["partition_ids"], [0, 0, 0, 0]) def testTrainFnMulticlassFullHessian(self): @@ -698,7 +811,8 @@ def testTrainFnMulticlassFullHessian(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=5, features=features) + logits_dimension=5, + features=features) predictions = array_ops.constant( [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0], @@ -801,7 +915,8 @@ def testTrainFnMulticlassDiagonalHessian(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=5, features=features) + logits_dimension=5, + features=features) predictions = array_ops.constant( [[0.0, -1.0, 0.5, 1.2, 3.1], [1.0, 0.0, 0.8, 0.3, 1.0], @@ -893,8 +1008,8 @@ def testTrainFnMulticlassTreePerClass(self): learner_config.constraints.max_tree_depth = 1 learner_config.constraints.min_node_weight = 0 features = { - "dense_float": array_ops.constant( - [[1.0], [1.5], [2.0]], dtypes.float32), + "dense_float": + array_ops.constant([[1.0], [1.5], [2.0]], dtypes.float32), } gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel( @@ -904,7 +1019,8 @@ def testTrainFnMulticlassTreePerClass(self): ensemble_handle=ensemble_handle, examples_per_layer=1, learner_config=learner_config, - logits_dimension=5, features=features) + logits_dimension=5, + features=features) batch_size = 3 predictions = array_ops.constant( @@ -986,7 +1102,8 @@ def testTrainFnMulticlassTreePerClass(self): self.assertAllClose( 0.893284678459, output.trees[0].nodes[2].leaf.sparse_vector.value[0], - atol=1e-4, rtol=1e-4) + atol=1e-4, + rtol=1e-4) def testTrainFnChiefFeatureSelectionReachedLimitNoGoodSplit(self): """Tests the train function running on chief with feature selection.""" @@ -1230,9 +1347,9 @@ def testTrainFnChiefFeatureSelectionReachedLimitIncrementAttemptedLayer(self): tree_ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig() tree = tree_ensemble_config.trees.add() - _set_float_split(tree.nodes.add() - .sparse_float_binary_split_default_right.split, 2, 4.0, - 1, 2) + _set_float_split( + tree.nodes.add().sparse_float_binary_split_default_right.split, 2, + 4.0, 1, 2) _append_to_leaf(tree.nodes.add().leaf, 0, 0.5) _append_to_leaf(tree.nodes.add().leaf, 1, 1.2) tree_ensemble_config.tree_weights.append(1.0) @@ -1241,7 +1358,8 @@ def testTrainFnChiefFeatureSelectionReachedLimitIncrementAttemptedLayer(self): metadata.num_layers_grown = 1 tree_ensemble_config = tree_ensemble_config.SerializeToString() ensemble_handle = model_ops.tree_ensemble_variable( - stamp_token=0, tree_ensemble_config=tree_ensemble_config, + stamp_token=0, + tree_ensemble_config=tree_ensemble_config, name="tree_ensemble") learner_config = learner_pb2.LearnerConfig() learner_config.learning_rate_tuner.fixed.learning_rate = 0.1 @@ -1333,5 +1451,6 @@ 
def testTrainFnChiefFeatureSelectionReachedLimitIncrementAttemptedLayer(self): self.assertEquals(output.growing_metadata.num_layers_attempted, 2) + if __name__ == "__main__": googletest.main() diff --git a/tensorflow/contrib/checkpoint/__init__.py b/tensorflow/contrib/checkpoint/__init__.py index 1192cc44a17823..8ae493ba998bd8 100644 --- a/tensorflow/contrib/checkpoint/__init__.py +++ b/tensorflow/contrib/checkpoint/__init__.py @@ -14,19 +14,37 @@ # ============================================================================== """Tools for working with object-based checkpoints. - -For creating and managing dependencies: +Visualization and inspection: @@dot_graph_from_checkpoint +@@object_metadata + +Managing dependencies: +@@Checkpointable +@@CheckpointableObjectGraph +@@NoDependency @@split_dependency + +Checkpointable data structures: +@@List +@@Mapping +@@UniqueNameTracker """ from __future__ import absolute_import from __future__ import division from __future__ import print_function +from tensorflow.contrib.checkpoint.python.containers import UniqueNameTracker from tensorflow.contrib.checkpoint.python.split_dependency import split_dependency from tensorflow.contrib.checkpoint.python.visualize import dot_graph_from_checkpoint +from tensorflow.core.protobuf.checkpointable_object_graph_pb2 import CheckpointableObjectGraph +from tensorflow.python.training.checkpointable.base import Checkpointable +from tensorflow.python.training.checkpointable.base import NoDependency +from tensorflow.python.training.checkpointable.data_structures import List +from tensorflow.python.training.checkpointable.data_structures import Mapping +from tensorflow.python.training.checkpointable.util import object_metadata from tensorflow.python.util.all_util import remove_undocumented remove_undocumented(module_name=__name__) + diff --git a/tensorflow/contrib/checkpoint/python/BUILD b/tensorflow/contrib/checkpoint/python/BUILD index a5681ffa61d07e..7b200a29bf6008 100644 --- a/tensorflow/contrib/checkpoint/python/BUILD +++ b/tensorflow/contrib/checkpoint/python/BUILD @@ -8,8 +8,35 @@ py_library( name = "checkpoint", srcs_version = "PY2AND3", deps = [ + ":containers", ":split_dependency", ":visualize", + "//tensorflow/python/training/checkpointable:data_structures", + ], +) + +py_library( + name = "containers", + srcs = ["containers.py"], + srcs_version = "PY2AND3", + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python/training/checkpointable:base", + "//tensorflow/python/training/checkpointable:data_structures", + ], +) + +py_test( + name = "containers_test", + srcs = ["containers_test.py"], + deps = [ + ":containers", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:resource_variable_ops", + "//tensorflow/python/training/checkpointable:base", + "//tensorflow/python/training/checkpointable:util", + "@six_archive//:six", ], ) @@ -21,6 +48,7 @@ py_library( deps = [ "//tensorflow/python:control_flow_ops", "//tensorflow/python:training", + "//tensorflow/python/training/checkpointable:base", ], ) @@ -32,8 +60,9 @@ py_test( "//tensorflow/python:array_ops", "//tensorflow/python:framework_test_lib", "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:training", "//tensorflow/python/eager:test", + "//tensorflow/python/training/checkpointable:base", + "//tensorflow/python/training/checkpointable:util", ], ) @@ -44,6 +73,8 @@ py_library( visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/python:pywrap_tensorflow", + 
"//tensorflow/python/training/checkpointable:base", + "//tensorflow/python/training/checkpointable:util", ], ) @@ -52,10 +83,13 @@ py_test( srcs = ["visualize_test.py"], deps = [ ":visualize", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework_test_lib", + "//tensorflow/python:constant_op", "//tensorflow/python:resource_variable_ops", "//tensorflow/python:training", + "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", + "//tensorflow/python/keras:engine", + "//tensorflow/python/keras:layers", + "//tensorflow/python/training/checkpointable:util", ], ) diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py new file mode 100644 index 00000000000000..4d3d5312993740 --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/containers.py @@ -0,0 +1,80 @@ +"""Checkpointable data structures.""" +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.training.checkpointable import base as checkpointable_lib +from tensorflow.python.training.checkpointable import data_structures + + +class UniqueNameTracker(data_structures.CheckpointableDataStructure): + """Adds dependencies on checkpointable objects with name hints. + + Useful for creating dependencies with locally unique names. + + Example usage: + ```python + class SlotManager(tf.contrib.checkpoint.Checkpointable): + + def __init__(self): + # Create a dependency named "slotdeps" on the container. + self.slotdeps = tf.contrib.checkpoint.UniqueNameTracker() + slotdeps = self.slotdeps + slots = [] + slots.append(slotdeps.track(tfe.Variable(3.), "x")) # Named "x" + slots.append(slotdeps.track(tfe.Variable(4.), "y")) + slots.append(slotdeps.track(tfe.Variable(5.), "x")) # Named "x_1" + ``` + """ + + def __init__(self): + super(UniqueNameTracker, self).__init__() + self._maybe_initialize_checkpointable() + self._name_counts = {} + + def track(self, checkpointable, base_name): + """Add a dependency on `checkpointable`. + + Args: + checkpointable: An object to add a checkpoint dependency on. + base_name: A name hint, which is uniquified to determine the dependency + name. + Returns: + `checkpointable`, for chaining. + Raises: + ValueError: If `checkpointable` is not a checkpointable object. 
+ """ + + if not isinstance(checkpointable, checkpointable_lib.CheckpointableBase): + raise ValueError( + ("Expected a checkpointable value, got %s which does not inherit " + "from CheckpointableBase.") % (checkpointable,)) + + def _format_name(prefix, number): + if number > 0: + return "%s_%d" % (prefix, number) + else: + return prefix + + count = self._name_counts.get(base_name, 0) + candidate = _format_name(base_name, count) + while self._lookup_dependency(candidate) is not None: + count += 1 + candidate = _format_name(base_name, count) + self._name_counts[base_name] = count + 1 + self._track_value(checkpointable, name=candidate) + return checkpointable diff --git a/tensorflow/contrib/checkpoint/python/containers_test.py b/tensorflow/contrib/checkpoint/python/containers_test.py new file mode 100644 index 00000000000000..3717d7f583ffdc --- /dev/null +++ b/tensorflow/contrib/checkpoint/python/containers_test.py @@ -0,0 +1,108 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import six + +from tensorflow.contrib.checkpoint.python import containers +from tensorflow.python.framework import test_util +from tensorflow.python.keras import layers +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import resource_variable_ops +from tensorflow.python.platform import test +from tensorflow.python.training.checkpointable import base as checkpointable +from tensorflow.python.training.checkpointable import util as checkpointable_utils + + +class UniqueNameTrackerTests(test.TestCase): + + @test_util.run_in_graph_and_eager_modes() + def testNames(self): + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + + x1 = resource_variable_ops.ResourceVariable(2.) + x2 = resource_variable_ops.ResourceVariable(3.) + x3 = resource_variable_ops.ResourceVariable(4.) + y = resource_variable_ops.ResourceVariable(5.) + slots = containers.UniqueNameTracker() + slots.track(x1, "x") + slots.track(x2, "x") + slots.track(x3, "x_1") + slots.track(y, "y") + self.evaluate((x1.initializer, x2.initializer, x3.initializer, + y.initializer)) + save_root = checkpointable_utils.Checkpoint(slots=slots) + save_path = save_root.save(checkpoint_prefix) + + restore_slots = checkpointable.Checkpointable() + restore_root = checkpointable_utils.Checkpoint( + slots=restore_slots) + status = restore_root.restore(save_path) + restore_slots.x = resource_variable_ops.ResourceVariable(0.) + restore_slots.x_1 = resource_variable_ops.ResourceVariable(0.) + restore_slots.x_1_1 = resource_variable_ops.ResourceVariable(0.) + restore_slots.y = resource_variable_ops.ResourceVariable(0.) 
+ status.assert_consumed().run_restore_ops() + self.assertEqual(2., self.evaluate(restore_slots.x)) + self.assertEqual(3., self.evaluate(restore_slots.x_1)) + self.assertEqual(4., self.evaluate(restore_slots.x_1_1)) + self.assertEqual(5., self.evaluate(restore_slots.y)) + + @test_util.run_in_graph_and_eager_modes() + def testExample(self): + class SlotManager(checkpointable.Checkpointable): + + def __init__(self): + self.slotdeps = containers.UniqueNameTracker() + slotdeps = self.slotdeps + slots = [] + slots.append(slotdeps.track( + resource_variable_ops.ResourceVariable(3.), "x")) + slots.append(slotdeps.track( + resource_variable_ops.ResourceVariable(4.), "y")) + slots.append(slotdeps.track( + resource_variable_ops.ResourceVariable(5.), "x")) + self.slots = slots + + manager = SlotManager() + self.evaluate([v.initializer for v in manager.slots]) + checkpoint = checkpointable_utils.Checkpoint(slot_manager=manager) + checkpoint_directory = self.get_temp_dir() + checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt") + save_path = checkpoint.save(checkpoint_prefix) + metadata = checkpointable_utils.object_metadata(save_path) + dependency_names = [] + for node in metadata.nodes: + for child in node.children: + dependency_names.append(child.local_name) + six.assertCountEqual( + self, + dependency_names, + ["x", "x_1", "y", "slot_manager", "slotdeps", "save_counter"]) + + @test_util.run_in_graph_and_eager_modes() + def testLayers(self): + tracker = containers.UniqueNameTracker() + tracker.track(layers.Dense(3), "dense") + tracker.layers[0](array_ops.zeros([1, 1])) + self.assertEqual(2, len(tracker.trainable_weights)) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/checkpoint/python/split_dependency.py b/tensorflow/contrib/checkpoint/python/split_dependency.py index 3aec8c96e90440..7e77453f3d848c 100644 --- a/tensorflow/contrib/checkpoint/python/split_dependency.py +++ b/tensorflow/contrib/checkpoint/python/split_dependency.py @@ -20,8 +20,8 @@ import functools from tensorflow.python.ops import control_flow_ops -from tensorflow.python.training import checkpointable as checkpointable from tensorflow.python.training import saver as saver_lib +from tensorflow.python.training.checkpointable import base as checkpointable class _CallbackSaveable(saver_lib.BaseSaverBuilder.SaveableObject): diff --git a/tensorflow/contrib/checkpoint/python/split_dependency_test.py b/tensorflow/contrib/checkpoint/python/split_dependency_test.py index f1d9d19b047ee6..69dc0b9be2d554 100644 --- a/tensorflow/contrib/checkpoint/python/split_dependency_test.py +++ b/tensorflow/contrib/checkpoint/python/split_dependency_test.py @@ -23,8 +23,8 @@ from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.training import checkpointable -from tensorflow.python.training import checkpointable_utils +from tensorflow.python.training.checkpointable import base as checkpointable +from tensorflow.python.training.checkpointable import util as checkpointable_utils def _split_variable_closure(variable): diff --git a/tensorflow/contrib/checkpoint/python/visualize.py b/tensorflow/contrib/checkpoint/python/visualize.py index 86fbdb41d2c378..bac071c4cff383 100644 --- a/tensorflow/contrib/checkpoint/python/visualize.py +++ b/tensorflow/contrib/checkpoint/python/visualize.py @@ -17,10 +17,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.core.protobuf 
import checkpointable_object_graph_pb2 from tensorflow.python import pywrap_tensorflow -from tensorflow.python.framework import errors_impl -from tensorflow.python.training import checkpointable +from tensorflow.python.training.checkpointable import base as checkpointable +from tensorflow.python.training.checkpointable import util as checkpointable_utils def dot_graph_from_checkpoint(save_path): @@ -52,20 +51,9 @@ def dot_graph_from_checkpoint(save_path): A graph in DOT format as a string. """ reader = pywrap_tensorflow.NewCheckpointReader(save_path) - try: - object_graph_string = reader.get_tensor( - checkpointable.OBJECT_GRAPH_PROTO_KEY) - except errors_impl.NotFoundError: - raise ValueError( - ('The specified checkpoint "%s" does not appear to be object-based (it ' - 'is missing the key "%s"). Likely it was created with a name-based ' - 'saver and does not contain an object dependency graph.') % ( - save_path, checkpointable.OBJECT_GRAPH_PROTO_KEY)) + object_graph = checkpointable_utils.object_metadata(save_path) shape_map = reader.get_variable_to_shape_map() dtype_map = reader.get_variable_to_dtype_map() - object_graph = ( - checkpointable_object_graph_pb2.CheckpointableObjectGraph()) - object_graph.ParseFromString(object_graph_string) graph = 'digraph {\n' def _escape(name): return name.replace('"', '\\"') diff --git a/tensorflow/contrib/checkpoint/python/visualize_test.py b/tensorflow/contrib/checkpoint/python/visualize_test.py index 1d9ab789235cb9..583e3bc442893d 100644 --- a/tensorflow/contrib/checkpoint/python/visualize_test.py +++ b/tensorflow/contrib/checkpoint/python/visualize_test.py @@ -24,11 +24,11 @@ from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import constant_op -from tensorflow.python.keras._impl.keras.engine import training -from tensorflow.python.keras._impl.keras.layers import core +from tensorflow.python.keras.engine import training +from tensorflow.python.keras.layers import core from tensorflow.python.ops import resource_variable_ops from tensorflow.python.training import adam -from tensorflow.python.training import checkpointable_utils +from tensorflow.python.training.checkpointable import util as checkpointable_utils try: import pydot # pylint: disable=g-import-not-at-top diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD index f3a75e8688ece1..42ba368531468b 100644 --- a/tensorflow/contrib/cloud/BUILD +++ b/tensorflow/contrib/cloud/BUILD @@ -15,7 +15,10 @@ load( ) tf_gen_op_libs( - op_lib_names = ["bigquery_reader_ops"], + op_lib_names = [ + "bigquery_reader_ops", + "gcs_config_ops", + ], deps = [ "//tensorflow/core:lib", ], @@ -28,15 +31,25 @@ tf_gen_op_wrapper_py( deps = [":bigquery_reader_ops_op_lib"], ) +tf_gen_op_wrapper_py( + name = "gen_gcs_config_ops", + out = "python/ops/gen_gcs_config_ops.py", + require_shape_functions = True, + visibility = ["//tensorflow:internal"], + deps = [":gcs_config_ops_op_lib"], +) + py_library( name = "cloud_py", srcs = [ "__init__.py", "python/ops/bigquery_reader_ops.py", + "python/ops/gcs_config_ops.py", ], srcs_version = "PY2AND3", deps = [ ":gen_bigquery_reader_ops", + ":gen_gcs_config_ops", "//tensorflow/python:framework_for_generated_wrappers", "//tensorflow/python:io_ops", "//tensorflow/python:util", diff --git a/tensorflow/contrib/cloud/__init__.py b/tensorflow/contrib/cloud/__init__.py index 8870264b95dfd9..a6e13ea3ae9384 100644 --- a/tensorflow/contrib/cloud/__init__.py +++ b/tensorflow/contrib/cloud/__init__.py @@ -20,9 
+20,15 @@ # pylint: disable=line-too-long,wildcard-import from tensorflow.contrib.cloud.python.ops.bigquery_reader_ops import * +from tensorflow.contrib.cloud.python.ops.gcs_config_ops import * # pylint: enable=line-too-long,wildcard-import from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['BigQueryReader'] +_allowed_symbols = [ + 'BigQueryReader', + 'ConfigureColabSession', + 'ConfigureGcs', + 'ConfigureGcsHook', +] remove_undocumented(__name__, _allowed_symbols) diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD index ff46f0daa80a70..40160706f70e8f 100644 --- a/tensorflow/contrib/cloud/kernels/BUILD +++ b/tensorflow/contrib/cloud/kernels/BUILD @@ -73,3 +73,17 @@ tf_proto_library( srcs = ["bigquery_table_partition.proto"], cc_api_version = 2, ) + +tf_kernel_library( + name = "gcs_config_ops", + srcs = ["gcs_config_ops.cc"], + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "//tensorflow/core/platform/cloud:curl_http_request", + "//tensorflow/core/platform/cloud:gcs_file_system", + "//tensorflow/core/platform/cloud:oauth_client", + "@jsoncpp_git//:jsoncpp", + ], +) diff --git a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc new file mode 100644 index 00000000000000..648a219fb87a6e --- /dev/null +++ b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc @@ -0,0 +1,205 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include "include/json/json.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/cloud/curl_http_request.h" +#include "tensorflow/core/platform/cloud/gcs_file_system.h" +#include "tensorflow/core/platform/cloud/oauth_client.h" +#include "tensorflow/core/util/ptr_util.h" + +namespace tensorflow { +namespace { + +// The default initial delay between retries with exponential backoff. +constexpr int kInitialRetryDelayUsec = 500000; // 0.5 sec + +// The minimum time delta between now and the token expiration time +// for the token to be re-used. +constexpr int kExpirationTimeMarginSec = 60; + +// The URL to retrieve the auth bearer token via OAuth with a refresh token. +constexpr char kOAuthV3Url[] = "https://www.googleapis.com/oauth2/v3/token"; + +// The URL to retrieve the auth bearer token via OAuth with a private key. +constexpr char kOAuthV4Url[] = "https://www.googleapis.com/oauth2/v4/token"; + +// The authentication token scope to request. 
+constexpr char kOAuthScope[] = "https://www.googleapis.com/auth/cloud-platform";
+
+Status RetrieveGcsFs(OpKernelContext* ctx, RetryingGcsFileSystem** fs) {
+  DCHECK(fs != nullptr);
+  *fs = nullptr;
+
+  FileSystem* filesystem = nullptr;
+  TF_RETURN_IF_ERROR(
+      ctx->env()->GetFileSystemForFile("gs://fake/file.text", &filesystem));
+  if (filesystem == nullptr) {
+    return errors::FailedPrecondition("The GCS file system is not registered.");
+  }
+
+  *fs = dynamic_cast<RetryingGcsFileSystem*>(filesystem);
+  if (*fs == nullptr) {
+    return errors::Internal(
+        "The filesystem registered under the 'gs://' scheme was not a "
+        "tensorflow::RetryingGcsFileSystem*.");
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status ParseScalarArgument(OpKernelContext* ctx, StringPiece argument_name,
+                           T* output) {
+  const Tensor* argument_t;
+  TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+  if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+    return errors::InvalidArgument(argument_name, " must be a scalar");
+  }
+  *output = argument_t->scalar<T>()();
+  return Status::OK();
+}
+
+// GcsCredentialsOpKernel overrides the credentials used by the gcs_filesystem.
+class GcsCredentialsOpKernel : public OpKernel {
+ public:
+  explicit GcsCredentialsOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    // Get a handle to the GCS file system.
+    RetryingGcsFileSystem* gcs = nullptr;
+    OP_REQUIRES_OK(ctx, RetrieveGcsFs(ctx, &gcs));
+
+    string json_string;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "json", &json_string));
+
+    Json::Value json;
+    Json::Reader reader;
+    std::stringstream json_stream(json_string);
+    OP_REQUIRES(ctx, reader.parse(json_stream, json),
+                errors::InvalidArgument("Could not parse json: ", json_string));
+
+    OP_REQUIRES(
+        ctx, json.isMember("refresh_token") || json.isMember("private_key"),
+        errors::InvalidArgument("JSON format incompatible; did not find fields "
+                                "`refresh_token` or `private_key`."));
+
+    auto provider =
+        tensorflow::MakeUnique<ConstantAuthProvider>(json, ctx->env());
+
+    // Test getting a token
+    string dummy_token;
+    OP_REQUIRES_OK(ctx, provider->GetToken(&dummy_token));
+    OP_REQUIRES(ctx, !dummy_token.empty(),
+                errors::InvalidArgument(
+                    "Could not retrieve a token with the given credentials."));
+
+    // Set the provider.
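+    // Note: the new provider takes effect for the whole process; every session
+    // running on this runtime will authenticate GCS requests with these
+    // credentials.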
+    gcs->underlying()->SetAuthProvider(std::move(provider));
+  }
+
+ private:
+  class ConstantAuthProvider : public AuthProvider {
+   public:
+    ConstantAuthProvider(const Json::Value& json,
+                         std::unique_ptr<OAuthClient> oauth_client, Env* env,
+                         int64 initial_retry_delay_usec)
+        : json_(json),
+          oauth_client_(std::move(oauth_client)),
+          env_(env),
+          initial_retry_delay_usec_(initial_retry_delay_usec) {}
+
+    ConstantAuthProvider(const Json::Value& json, Env* env)
+        : ConstantAuthProvider(json, tensorflow::MakeUnique<OAuthClient>(), env,
+                               kInitialRetryDelayUsec) {}
+
+    ~ConstantAuthProvider() override {}
+
+    Status GetToken(string* token) override {
+      mutex_lock l(mu_);
+      const uint64 now_sec = env_->NowSeconds();
+
+      if (!current_token_.empty() &&
+          now_sec + kExpirationTimeMarginSec < expiration_timestamp_sec_) {
+        *token = current_token_;
+        return Status::OK();
+      }
+      if (json_.isMember("refresh_token")) {
+        TF_RETURN_IF_ERROR(oauth_client_->GetTokenFromRefreshTokenJson(
+            json_, kOAuthV3Url, &current_token_, &expiration_timestamp_sec_));
+      } else if (json_.isMember("private_key")) {
+        TF_RETURN_IF_ERROR(oauth_client_->GetTokenFromServiceAccountJson(
+            json_, kOAuthV4Url, kOAuthScope, &current_token_,
+            &expiration_timestamp_sec_));
+      } else {
+        return errors::FailedPrecondition(
+            "Unexpected content of the JSON credentials file.");
+      }
+
+      *token = current_token_;
+      return Status::OK();
+    }
+
+   private:
+    Json::Value json_;
+    std::unique_ptr<OAuthClient> oauth_client_;
+    Env* env_;
+
+    mutex mu_;
+    string current_token_ GUARDED_BY(mu_);
+    uint64 expiration_timestamp_sec_ GUARDED_BY(mu_) = 0;
+
+    // The initial delay for exponential backoffs when retrying failed calls.
+    const int64 initial_retry_delay_usec_;
+    TF_DISALLOW_COPY_AND_ASSIGN(ConstantAuthProvider);
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("GcsConfigureCredentials").Device(DEVICE_CPU),
+                        GcsCredentialsOpKernel);
+
+class GcsBlockCacheOpKernel : public OpKernel {
+ public:
+  explicit GcsBlockCacheOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    // Get a handle to the GCS file system.
+    RetryingGcsFileSystem* gcs = nullptr;
+    OP_REQUIRES_OK(ctx, RetrieveGcsFs(ctx, &gcs));
+
+    size_t max_cache_size, block_size, max_staleness;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<size_t>(ctx, "max_cache_size",
+                                                    &max_cache_size));
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<size_t>(ctx, "block_size", &block_size));
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<size_t>(ctx, "max_staleness", &max_staleness));
+
+    if (gcs->underlying()->block_size() == block_size &&
+        gcs->underlying()->max_bytes() == max_cache_size &&
+        gcs->underlying()->max_staleness() == max_staleness) {
+      LOG(INFO) << "Skipping resetting the GCS block cache.";
+      return;
+    }
+    gcs->underlying()->ResetFileBlockCache(block_size, max_cache_size,
+                                           max_staleness);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("GcsConfigureBlockCache").Device(DEVICE_CPU),
+                        GcsBlockCacheOpKernel);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/cloud/ops/gcs_config_ops.cc b/tensorflow/contrib/cloud/ops/gcs_config_ops.cc
new file mode 100644
index 00000000000000..9cf85f5f1811d8
--- /dev/null
+++ b/tensorflow/contrib/cloud/ops/gcs_config_ops.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" + +namespace tensorflow { + +REGISTER_OP("GcsConfigureCredentials") + .Input("json: string") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Configures the credentials used by the GCS client of the local TF runtime. + +The json input can be of the format: + +1. Refresh Token: +{ + "client_id": "", + "client_secret": "", + "refresh_token: "", + "type": "authorized_user", +} + +2. Service Account: +{ + "type": "service_account", + "project_id": "", + "private_key_id": "", + "private_key": "------BEGIN PRIVATE KEY-----\n\n-----END PRIVATE KEY------\n", + "client_email": "@.iam.gserviceaccount.com", + "client_id": "", + # Some additional fields elided +} + +Note the credentials established through this method are shared across all +sessions run on this runtime. + +Note be sure to feed the inputs to this op to ensure the credentials are not +stored in a constant op within the graph that might accidentally be checkpointed +or in other ways be persisted or exfiltrated. +)doc"); + +REGISTER_OP("GcsConfigureBlockCache") + .Input("max_cache_size: uint64") + .Input("block_size: uint64") + .Input("max_staleness: uint64") + .SetShapeFn(shape_inference::NoOutputs) + .Doc(R"doc( +Re-configures the GCS block cache with the new configuration values. + +If the values are the same as already configured values, this op is a no-op. If +they are different, the current contents of the block cache is dropped, and a +new block cache is created fresh. +)doc"); + +} // namespace tensorflow diff --git a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py new file mode 100644 index 00000000000000..8c8c5acb31af69 --- /dev/null +++ b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py @@ -0,0 +1,188 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""GCS file system configuration for TensorFlow.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import json + +from tensorflow.contrib.cloud.python.ops import gen_gcs_config_ops +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.training import training + + +# @tf_export('contrib.cloud.BlockCacheParams') +class BlockCacheParams(object): + """BlockCacheParams is a struct used for configuring the GCS Block Cache.""" + + def __init__(self, block_size=None, max_bytes=None, max_staleness=None): + self._block_size = block_size or 128 * 1024 * 1024 + self._max_bytes = max_bytes or 2 * self._block_size + self._max_staleness = max_staleness or 0 + + @property + def block_size(self): + return self._block_size + + @property + def max_bytes(self): + return self._max_bytes + + @property + def max_staleness(self): + return self._max_staleness + + +# @tf_export('contrib.cloud.ConfigureGcsHook') +class ConfigureGcsHook(training.SessionRunHook): + """ConfigureGcsHook configures GCS when used with Estimator/TPUEstimator. + + Warning: GCS `credentials` may be transmitted over the network unencrypted. + Please ensure that the network is trusted before using this function. For + users running code entirely within Google Cloud, your data is protected by + encryption in between data centers. For more information, please take a look + at https://cloud.google.com/security/encryption-in-transit/. + + Example: + + ``` + sess = tf.Session() + refresh_token = raw_input("Refresh token: ") + client_secret = raw_input("Client secret: ") + client_id = "" + creds = { + "client_id": client_id, + "refresh_token": refresh_token, + "client_secret": client_secret, + "type": "authorized_user", + } + tf.contrib.cloud.configure_gcs(sess, credentials=creds) + ``` + + """ + + def _verify_dictionary(self, creds_dict): + if 'refresh_token' in creds_dict or 'private_key' in creds_dict: + return True + return False + + def __init__(self, credentials=None, block_cache=None): + """Constructs a ConfigureGcsHook. + + Args: + credentials: A json-formatted string. + block_cache: A `BlockCacheParams` + + Raises: + ValueError: If credentials is improperly formatted or block_cache is not a + BlockCacheParams. 
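+
+    Example (a minimal sketch; `estimator`, `input_fn`, and the credentials
+    dict `creds` are assumed to exist, with `creds` holding a `refresh_token`
+    or `private_key` field):
+
+    ```python
+    hook = ConfigureGcsHook(credentials=creds, block_cache=BlockCacheParams())
+    estimator.train(input_fn=input_fn, hooks=[hook])
+    ```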
+ """ + if credentials is not None: + if isinstance(credentials, str): + try: + data = json.loads(credentials) + except ValueError as e: + raise ValueError('credentials was not a well formed JSON string.', e) + if not self._verify_dictionary(data): + raise ValueError( + 'credentials has neither a "refresh_token" nor a "private_key" ' + 'field.') + elif isinstance(credentials, dict): + if not self._verify_dictionary(credentials): + raise ValueError('credentials has neither a "refresh_token" nor a ' + '"private_key" field.') + credentials = json.dumps(credentials) + else: + raise ValueError('credentials is of an unknown type') + + self._credentials = credentials + + if block_cache and not isinstance(block_cache, BlockCacheParams): + raise ValueError('block_cache must be an instance of BlockCacheParams.') + self._block_cache = block_cache + + def begin(self): + if self._credentials: + self._credentials_placeholder = array_ops.placeholder(dtypes.string) + self._credentials_ops = gen_gcs_config_ops.gcs_configure_credentials( + self._credentials_placeholder) + if self._block_cache: + self._block_cache_op = gen_gcs_config_ops.gcs_configure_block_cache( + max_cache_size=self._block_cache.max_bytes, + block_size=self._block_cache.block_size, + max_staleness=self._block_cache.max_staleness) + + def after_create_session(self, session, coord): + del coord + if self._credentials_op: + session.run( + self._credentials_op, + feed_dict={self._credentials_placeholder: self._credentials}) + if self._block_cache_op: + session.run(self._block_cache_op) + + +def configure_gcs(session, credentials=None, block_cache=None, device=None): + """Configures the GCS file system for a given a session. + + Warning: GCS `credentials` may be transmitted over the network unencrypted. + Please ensure that the network is trusted before using this function. For + users running code entirely within Google Cloud, your data is protected by + encryption in between data centers. For more information, please take a look + at https://cloud.google.com/security/encryption-in-transit/. + + Args: + session: A `tf.Session` session that should be used to configure the GCS + file system. + credentials: [Optional.] A JSON string + block_cache: [Optional.] A BlockCacheParams to configure the block cache . + device: [Optional.] The device to place the configure ops. + """ + + def configure(credentials, block_cache): + """Helper function to actually configure GCS.""" + if credentials: + if isinstance(credentials, dict): + credentials = json.dumps(credentials) + placeholder = array_ops.placeholder(dtypes.string) + op = gen_gcs_config_ops.gcs_configure_credentials(placeholder) + session.run(op, feed_dict={placeholder: credentials}) + if block_cache: + op = gen_gcs_config_ops.gcs_configure_block_cache( + max_cache_size=block_cache.max_bytes, + block_size=block_cache.block_size, + max_staleness=block_cache.max_staleness) + session.run(op) + + if device: + with ops.device(device): + return configure(credentials, block_cache) + return configure(credentials, block_cache) + + +def configure_colab_session(session): + """ConfigureColabSession configures the GCS file system in Colab. + + Args: + session: A `tf.Session` session. + """ + # Read from the application default credentials (adc). 
+ with open('/content/datalab/adc.json') as f: + data = json.load(f) + configure_gcs(session, credentials=data) diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py index 1403483d287041..a5a9630a4aa382 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py @@ -36,6 +36,8 @@ _GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' +_DEFAULT_ENV_VARIABLE = 'TPU_NAME' +_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL' class TPUClusterResolver(ClusterResolver): @@ -70,6 +72,16 @@ def _inGke(): def _gkeMaster(): return os.environ[_GKE_ENV_VARIABLE].split(',')[0] + @staticmethod + def _envVarFallback(): + if _DEFAULT_ENV_VARIABLE in os.environ: + return os.environ[_DEFAULT_ENV_VARIABLE] + return None + + @staticmethod + def _discoveryUrl(): + return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE) + def __init__(self, tpu=None, zone=None, @@ -78,7 +90,8 @@ def __init__(self, coordinator_name=None, coordinator_address=None, credentials='default', - service=None): + service=None, + discovery_url=None): """Creates a new TPUClusterResolver object. The ClusterResolver will then use the parameters to query the Cloud TPU APIs @@ -108,6 +121,11 @@ def __init__(self, service: The GCE API object returned by the googleapiclient.discovery function. If you specify a custom service object, then the credentials parameter will be ignored. + discovery_url: A URL template that points to the location of + the discovery service. It should have two parameters {api} and + {apiVersion} that when filled in produce an absolute URL to the + discovery document for that service. The environment variable + 'TPU_API_DISCOVERY_URL' will override this. Raises: ImportError: If the googleapiclient is not installed. @@ -123,8 +141,11 @@ def __init__(self, in_gke = self._inGke() # When using GKE with Cloud TPUs, the env variable will be set. - if tpu is None and in_gke: - tpu = self._gkeMaster() + if tpu is None: + if in_gke: + tpu = self._gkeMaster() + else: + tpu = self._envVarFallback() self._tpu = compat.as_bytes(tpu) # self._tpu is always bytes self._job_name = job_name @@ -149,14 +170,22 @@ def __init__(self, if service is None and should_resolve: if not _GOOGLE_API_CLIENT_INSTALLED: - raise ImportError('googleapiclient must be installed before using the ' - 'TPU cluster resolver. Execute: `pip install ' - '--upgrade google-api-python-client` to install with ' - 'pip.') - - self._service = discovery.build( - 'tpu', 'v1alpha1', - credentials=self._credentials) + raise ImportError('googleapiclient and oauth2client must be installed ' + 'before using the TPU cluster resolver. 
Execute: ' + '`pip install --upgrade google-api-python-client` ' + 'and `pip install --upgrade oauth2client` to ' + 'install with pip.') + + final_discovery_url = self._discoveryUrl() or discovery_url + if final_discovery_url: + self._service = discovery.build( + 'tpu', 'v1alpha1', + credentials=self._credentials, + discoveryServiceUrl=final_discovery_url) + else: + self._service = discovery.build( + 'tpu', 'v1alpha1', + credentials=self._credentials) else: self._service = service diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py index 5b3f9be5a11237..5fac55fd027fa2 100644 --- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py +++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py @@ -367,6 +367,10 @@ def testGkeEnvironment(self): compat.as_bytes(TPUClusterResolver._gkeMaster())) del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] + def testDiscoveryUrl(self): + os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}' + self.assertEqual('https://{api}.internal/{apiVersion}', + TPUClusterResolver._discoveryUrl()) if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt index 8f6b412955a480..0676f6c665fa11 100644 --- a/tensorflow/contrib/cmake/CMakeLists.txt +++ b/tensorflow/contrib/cmake/CMakeLists.txt @@ -84,7 +84,7 @@ if (NOT WIN32) option(systemlib_ALL "Turn on every possible systemlib_* options" OFF) if (systemlib_ALL) - set (systmelib_ZLIB ON) + set (systemlib_ZLIB ON) endif (systemlib_ALL) endif() @@ -172,19 +172,20 @@ if (tensorflow_OPTIMIZE_FOR_NATIVE_ARCH) endif() endif() +include(CheckCXXCompilerFlag) + +# OpenMP Support +CHECK_CXX_COMPILER_FLAG("-fopenmp" GCC_OPENMP_SUPPORT) +if (GCC_OPENMP_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") +endif() +CHECK_CXX_COMPILER_FLAG("/openmp" MSVC_OPENMP_SUPPORT) +if (MSVC_OPENMP_SUPPORT) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /openmp") +endif() + # MSVC SIMD instructions if (tensorflow_WIN_CPU_SIMD_OPTIONS) - include(CheckCXXCompilerFlag) - if (tensorflow_ENABLE_MKL_SUPPORT) - add_definitions(-DINTEL_MKL -DEIGEN_USE_VML) - if (NOT tensorflow_ENABLE_MKLDNN_SUPPORT) - add_definitions(-DINTEL_MKL_ML) - endif() - endif() - CHECK_CXX_COMPILER_FLAG("-fopenmp" COMPILER_OPT_OPENMP_SUPPORT) - if (COMPILER_OPT_OPENMP_SUPPORT) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp") - endif() if (WIN32) CHECK_CXX_COMPILER_FLAG(${tensorflow_WIN_CPU_SIMD_OPTIONS} COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED) if(COMPILER_OPT_WIN_CPU_SIMD_SUPPORTED) @@ -323,10 +324,13 @@ if(HAIKU) list(APPEND tensorflow_EXTERNAL_LIBRARIES network) endif() +# MKL Support if (tensorflow_ENABLE_MKL_SUPPORT) + add_definitions(-DINTEL_MKL -DEIGEN_USE_VML) if (WIN32) find_path(MKL_HOME_PLATFORM mkl PATHS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../ + $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../ PATH_SUFFIXES windows) set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include) set(MKL_LINK_DIRS @@ -345,6 +349,7 @@ if (tensorflow_ENABLE_MKL_SUPPORT) # Fix me: complete the path on linux find_path(MKL_HOME_PLATFORM mkl HINTS ${MKL_HOME} ${MKL_HOME}/../ ${MKL_HOME}/../../ + $ENV{MKLROOT} $ENV{MKLROOT}/../ $ENV{MKLROOT}/../../ PATH_SUFFIXES linux) set(MKL_INCLUDE_DIRS ${MKL_HOME_PLATFORM}/mkl/include) set(MKL_LINK_DIRS) # incompleted @@ -357,6 +362,8 @@ if (tensorflow_ENABLE_MKL_SUPPORT) 
list(APPEND tensorflow_EXTERNAL_LIBRARIES ${mkldnn_STATIC_LIBRARIES}) list(APPEND tensorflow_EXTERNAL_DEPENDENCIES mkldnn) include_directories(${mkldnn_INCLUDE_DIRS}) + else (tensorflow_ENABLE_MKLDNN_SUPPORT) + add_definitions(-DINTEL_MKL_ML) endif() endif (tensorflow_ENABLE_MKL_SUPPORT) diff --git a/tensorflow/contrib/cmake/external/zlib.cmake b/tensorflow/contrib/cmake/external/zlib.cmake index 116d42309394b9..8942f3eecf07ff 100644 --- a/tensorflow/contrib/cmake/external/zlib.cmake +++ b/tensorflow/contrib/cmake/external/zlib.cmake @@ -31,7 +31,8 @@ else (systemlib_ZLIB) set(ZLIB_URL https://github.com/madler/zlib) set(ZLIB_BUILD ${CMAKE_CURRENT_BINARY_DIR}/zlib/src/zlib) set(ZLIB_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/zlib/install) - set(ZLIB_TAG 50893291621658f355bc5b4d450a8d06a563053d) + # Match zlib version in tensorflow/workspace.bzl + set(ZLIB_TAG v1.2.11) if(WIN32) if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*") diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt index 6468bed4979253..015cb73bbd93bb 100644 --- a/tensorflow/contrib/cmake/python_modules.txt +++ b/tensorflow/contrib/cmake/python_modules.txt @@ -32,52 +32,13 @@ tensorflow/python/feature_column tensorflow/python/framework tensorflow/python/grappler tensorflow/python/keras -tensorflow/python/keras/activations tensorflow/python/keras/applications -tensorflow/python/keras/applications/densenet -tensorflow/python/keras/applications/inception_resnet_v2 -tensorflow/python/keras/applications/inception_v3 -tensorflow/python/keras/applications/mobilenet -tensorflow/python/keras/applications/nasnet -tensorflow/python/keras/applications/resnet50 -tensorflow/python/keras/applications/vgg16 -tensorflow/python/keras/applications/vgg19 -tensorflow/python/keras/applications/xception -tensorflow/python/keras/backend -tensorflow/python/keras/callbacks -tensorflow/python/keras/constraints tensorflow/python/keras/datasets -tensorflow/python/keras/datasets/boston_housing -tensorflow/python/keras/datasets/cifar10 -tensorflow/python/keras/datasets/cifar100 -tensorflow/python/keras/datasets/fashion_mnist -tensorflow/python/keras/datasets/imdb -tensorflow/python/keras/datasets/mnist -tensorflow/python/keras/datasets/reuters -tensorflow/python/keras/estimator -tensorflow/python/keras/initializers +tensorflow/python/keras/engine tensorflow/python/keras/layers -tensorflow/python/keras/losses -tensorflow/python/keras/metrics -tensorflow/python/keras/models -tensorflow/python/keras/optimizers tensorflow/python/keras/preprocessing -tensorflow/python/keras/preprocessing/image -tensorflow/python/keras/preprocessing/sequence -tensorflow/python/keras/preprocessing/text -tensorflow/python/keras/regularizers tensorflow/python/keras/utils tensorflow/python/keras/wrappers -tensorflow/python/keras/wrappers/scikit_learn -tensorflow/python/keras/_impl -tensorflow/python/keras/_impl/keras -tensorflow/python/keras/_impl/keras/applications -tensorflow/python/keras/_impl/keras/datasets -tensorflow/python/keras/_impl/keras/engine -tensorflow/python/keras/_impl/keras/layers -tensorflow/python/keras/_impl/keras/preprocessing -tensorflow/python/keras/_impl/keras/utils -tensorflow/python/keras/_impl/keras/wrappers tensorflow/python/kernel_tests tensorflow/python/kernel_tests/boosted_trees tensorflow/python/kernel_tests/distributions @@ -100,6 +61,7 @@ tensorflow/python/summary tensorflow/python/summary/writer tensorflow/python/tools tensorflow/python/training +tensorflow/python/training/checkpointable 
tensorflow/python/user_ops tensorflow/python/util tensorflow/python/util/protobuf @@ -153,6 +115,8 @@ tensorflow/contrib/coder/python/ops tensorflow/contrib/compiler tensorflow/contrib/constrained_optimization tensorflow/contrib/constrained_optimization/python +tensorflow/contrib/control_flow +tensorflow/contrib/control_flow/python tensorflow/contrib/copy_graph tensorflow/contrib/copy_graph/python tensorflow/contrib/copy_graph/python/util @@ -333,6 +297,8 @@ tensorflow/contrib/metrics tensorflow/contrib/metrics/python tensorflow/contrib/metrics/python/metrics tensorflow/contrib/metrics/python/ops +tensorflow/contrib/mixed_precision +tensorflow/contrib/mixed_precision/python tensorflow/contrib/mpi_collectives/python tensorflow/contrib/mpi_collectives/python/ops tensorflow/contrib/model_pruning diff --git a/tensorflow/contrib/cmake/python_protos.txt b/tensorflow/contrib/cmake/python_protos.txt index d63c41db844af2..cf1ee2ad76f2cc 100644 --- a/tensorflow/contrib/cmake/python_protos.txt +++ b/tensorflow/contrib/cmake/python_protos.txt @@ -11,7 +11,6 @@ tensorflow/contrib/mpi tensorflow/contrib/mpi_collectives tensorflow/contrib/session_bundle tensorflow/contrib/tensor_forest/proto -tensorflow/contrib/tensorboard/graph_explorer/proto tensorflow/contrib/tensorboard/plugins/projector tensorflow/contrib/tensorboard/plugins/trace tensorflow/contrib/tpu/proto diff --git a/tensorflow/contrib/cmake/tf_c.cmake b/tensorflow/contrib/cmake/tf_c.cmake index c6a15f2ca075c8..2e0a2fcef4cbdc 100644 --- a/tensorflow/contrib/cmake/tf_c.cmake +++ b/tensorflow/contrib/cmake/tf_c.cmake @@ -21,9 +21,8 @@ set(tf_c_srcs "${tensorflow_source_dir}/tensorflow/c/c_api_function.cc" "${tensorflow_source_dir}/tensorflow/c/eager/c_api.cc" "${tensorflow_source_dir}/tensorflow/c/eager/c_api.h" + "${tensorflow_source_dir}/tensorflow/c/eager/c_api_debug.cc" "${tensorflow_source_dir}/tensorflow/c/eager/tape.h" - "${tensorflow_source_dir}/tensorflow/c/eager/runtime.cc" - "${tensorflow_source_dir}/tensorflow/c/eager/runtime.h" "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.cc" "${tensorflow_source_dir}/tensorflow/c/checkpoint_reader.h" "${tensorflow_source_dir}/tensorflow/c/tf_status_helper.cc" @@ -38,13 +37,15 @@ add_dependencies( tf_core_lib tf_protos_cc) -add_library(tf_c_python_api OBJECT - "${tensorflow_source_dir}/tensorflow/c/python_api.cc" - "${tensorflow_source_dir}/tensorflow/c/python_api.h" -) -add_dependencies( - tf_c_python_api - tf_c - tf_core_lib - tf_core_framework - tf_protos_cc) +if(tensorflow_BUILD_PYTHON_BINDINGS) + add_library(tf_c_python_api OBJECT + "${tensorflow_source_dir}/tensorflow/c/python_api.cc" + "${tensorflow_source_dir}/tensorflow/c/python_api.h" + ) + add_dependencies( + tf_c_python_api + tf_c + tf_core_lib + tf_core_framework + tf_protos_cc) +endif() diff --git a/tensorflow/contrib/cmake/tf_cc_ops.cmake b/tensorflow/contrib/cmake/tf_cc_ops.cmake index f73da0b8ab18af..6c90cf398c69c8 100644 --- a/tensorflow/contrib/cmake/tf_cc_ops.cmake +++ b/tensorflow/contrib/cmake/tf_cc_ops.cmake @@ -155,7 +155,7 @@ if (WIN32) set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/pywrap_tensorflow_internal.lib") endif() else (WIN32) - set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so") + set (pywrap_tensorflow_lib "${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX}") endif (WIN32) add_custom_target(tf_extension_ops) diff --git a/tensorflow/contrib/cmake/tf_core_framework.cmake b/tensorflow/contrib/cmake/tf_core_framework.cmake 
index b47c32f1c48b3d..dac84ccb0dbf48 100644 --- a/tensorflow/contrib/cmake/tf_core_framework.cmake +++ b/tensorflow/contrib/cmake/tf_core_framework.cmake @@ -213,10 +213,6 @@ else() list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_srcs_exclude}) endif() -file(GLOB tf_core_platform_exclude_srcs - "${tensorflow_source_dir}/tensorflow/core/platform/variant_coding.cc") -list(REMOVE_ITEM tf_core_platform_srcs ${tf_core_platform_exclude_srcs}) - list(APPEND tf_core_lib_srcs ${tf_core_platform_srcs}) if(UNIX) @@ -286,8 +282,6 @@ set(tf_version_srcs ${tensorflow_source_dir}/tensorflow/core/util/version_info.c file(GLOB_RECURSE tf_core_framework_srcs "${tensorflow_source_dir}/tensorflow/core/framework/*.h" "${tensorflow_source_dir}/tensorflow/core/framework/*.cc" - "${tensorflow_source_dir}/tensorflow/core/platform/variant_coding.h" - "${tensorflow_source_dir}/tensorflow/core/platform/variant_coding.cc" "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.h" "${tensorflow_source_dir}/tensorflow/core/graph/edgeset.cc" "${tensorflow_source_dir}/tensorflow/core/graph/graph.h" diff --git a/tensorflow/contrib/cmake/tf_core_kernels.cmake b/tensorflow/contrib/cmake/tf_core_kernels.cmake index f38c9e05135f9f..2d76bf530a2100 100644 --- a/tensorflow/contrib/cmake/tf_core_kernels.cmake +++ b/tensorflow/contrib/cmake/tf_core_kernels.cmake @@ -68,6 +68,8 @@ if(tensorflow_BUILD_CONTRIB_KERNELS) "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/kernels/range_coder_ops_util.cc" "${tensorflow_source_dir}/tensorflow/contrib/coder/ops/coder_ops.cc" + "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/csv_dataset_op.cc" + "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc" "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc" "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/prefetching_kernels.cc" "${tensorflow_source_dir}/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc" diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake index e558691de4b749..bc753333dba4f6 100644 --- a/tensorflow/contrib/cmake/tf_core_ops.cmake +++ b/tensorflow/contrib/cmake/tf_core_ops.cmake @@ -113,6 +113,7 @@ GENERATE_CONTRIB_OP_LIBRARY(tensor_forest_stats "${tensorflow_source_dir}/tensor GENERATE_CONTRIB_OP_LIBRARY(text_skip_gram "${tensorflow_source_dir}/tensorflow/contrib/text/ops/skip_gram_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(tpu "${tpu_ops_srcs}") GENERATE_CONTRIB_OP_LIBRARY(bigquery_reader "${tensorflow_source_dir}/tensorflow/contrib/cloud/ops/bigquery_reader_ops.cc") +GENERATE_CONTRIB_OP_LIBRARY(gcs_config "${tensorflow_source_dir}/tensorflow/contrib/cloud/ops/gcs_config_ops.cc") GENERATE_CONTRIB_OP_LIBRARY(reduce_slice_ops "${tensorflow_source_dir}/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc") ######################################################## diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake index c4bdb69d828b26..1959ad028a06f3 100755 --- a/tensorflow/contrib/cmake/tf_python.cmake +++ b/tensorflow/contrib/cmake/tf_python.cmake @@ -244,13 +244,11 @@ add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD # tf_python_op_gen_main library ######################################################## set(tf_python_op_gen_main_srcs - "${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.h" - 
"${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.cc" "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc" - "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc" - "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_main.cc" "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h" + "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.cc" "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.h" + "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_main.cc" ) add_library(tf_python_op_gen_main OBJECT ${tf_python_op_gen_main_srcs}) @@ -422,6 +420,8 @@ GENERATE_PYTHON_OP_LIB("contrib_text_skip_gram_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/text/python/ops/gen_skip_gram_ops.py) GENERATE_PYTHON_OP_LIB("contrib_bigquery_reader_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_bigquery_reader_ops.py) +GENERATE_PYTHON_OP_LIB("contrib_gcs_config_ops" + DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_gcs_config_ops.py) GENERATE_PYTHON_OP_LIB("stateless_random_ops" DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py) GENERATE_PYTHON_OP_LIB("debug_ops" @@ -464,12 +464,12 @@ set (pywrap_tensorflow_internal_src "${tensorflow_source_dir}/tensorflow/python/eager/pywrap_tfe_src.cc" "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.h" "${tensorflow_source_dir}/tensorflow/python/client/tf_session_helper.cc" - "${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.h" - "${tensorflow_source_dir}/tensorflow/python/eager/python_eager_op_gen.cc" "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.h" "${tensorflow_source_dir}/tensorflow/python/framework/cpp_shape_inference.cc" "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.h" "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen.cc" + "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.h" + "${tensorflow_source_dir}/tensorflow/python/framework/python_op_gen_internal.cc" "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.h" "${tensorflow_source_dir}/tensorflow/python/lib/core/bfloat16.cc" "${tensorflow_source_dir}/tensorflow/python/lib/core/numpy.h" @@ -715,7 +715,7 @@ if(WIN32) endif() else() add_custom_command(TARGET pywrap_tensorflow_internal POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal.so + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libpywrap_tensorflow_internal${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/_pywrap_tensorflow_internal.so) endif() @@ -725,7 +725,7 @@ endif() ######################################################## # Parse tensorflow/tools/api/generator/BUILD to get list of generated files. 
-FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text) +FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/api_gen.bzl api_generator_BUILD_text) STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text}) string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text}) string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text}) @@ -736,7 +736,7 @@ foreach(api_init_file ${api_init_files_list}) string(STRIP "${api_init_file}" api_init_file) if(api_init_file) string(REPLACE "\"" "" api_init_file "${api_init_file}") # Remove quotes - list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}") + list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/${api_init_file}") endif() endforeach(api_init_file) set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt") @@ -749,18 +749,14 @@ add_custom_command( # tensorflow/__init__.py depends on files generated in this step. So, remove it while # this step is running since the files aren't there yet. - COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py - ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py + COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py # Run create_python_api.py to generate API init files. COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE} - "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}" - - # Re-add tensorflow/__init__.py back. - COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py - COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py - ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py + "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" + "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py" + "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow" + "${api_init_list_file}" COMMENT "Generating __init__.py files for Python API." WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python" @@ -791,7 +787,6 @@ add_custom_command(TARGET tf_python_build_pip_package POST_BUILD add_custom_command(TARGET tf_python_copy_scripts_to_destination PRE_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/contrib/testing/python/framework/util_test.py ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/testing/python/framework/) - add_custom_command(TARGET tf_python_build_pip_package POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${tensorflow_source_dir}/tensorflow/tools/pip_package/README ${CMAKE_CURRENT_BINARY_DIR}/tf_python/) diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake index 92f2ab6dea8e7d..eb9482dc25f2be 100644 --- a/tensorflow/contrib/cmake/tf_tests.cmake +++ b/tensorflow/contrib/cmake/tf_tests.cmake @@ -212,6 +212,10 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/gmm_test.py" # Disable following manual tag in BUILD. 
"${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py" + # These tests depend on a .so file + ${tensorflow_source_dir}/tensorflow/python/kernel_tests/duplicate_op_test.py + ${tensorflow_source_dir}/tensorflow/python/kernel_tests/invalid_op_test.py + ${tensorflow_source_dir}/tensorflow/python/kernel_tests/ackermann_test.py ) if (WIN32) @@ -267,6 +271,8 @@ if (tensorflow_BUILD_PYTHON_TESTS) "${tensorflow_source_dir}/tensorflow/python/kernel_tests/variable_scope_test.py" "${tensorflow_source_dir}/tensorflow/python/kernel_tests/functional_ops_test.py" "${tensorflow_source_dir}/tensorflow/python/kernel_tests/py_func_test.py" + # Flaky on Windows cpu with py36 (b/73556968) + "${tensorflow_source_dir}/tensorflow/python/kernel_tests/sparse_reshape_op_test.py" # Windows file management related issues. "${tensorflow_source_dir}/tensorflow/python/training/evaluation_test.py" # training tests diff --git a/tensorflow/contrib/cmake/tools/create_def_file.py b/tensorflow/contrib/cmake/tools/create_def_file.py index cffe069aa352f8..4f957f1e0b46fd 100644 --- a/tensorflow/contrib/cmake/tools/create_def_file.py +++ b/tensorflow/contrib/cmake/tools/create_def_file.py @@ -44,7 +44,8 @@ DUMPBIN = "dumpbin.exe" # Exclude if matched -EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::") +EXCLUDE_RE = re.compile(r"RTTI|deleting destructor|::internal::|Internal|" + r"python_op_gen_internal|grappler") # Include if matched before exclude INCLUDEPRE_RE = re.compile(r"google::protobuf::internal::ExplicitlyConstructed|" @@ -56,6 +57,10 @@ r"tensorflow::ops::internal::Enter|" r"tensorflow::strings::internal::AppendPieces|" r"tensorflow::strings::internal::CatPieces|" + r"tensorflow::errors::Internal|" + r"tensorflow::Tensor::CopyFromInternal|" + r"tensorflow::kernel_factory::" + r"OpKernelRegistrar::InitInternal|" r"tensorflow::io::internal::JoinPathImpl") # Include if matched after exclude @@ -64,7 +69,7 @@ r"tensorflow::|" r"functor::|" r"\?nsync_|" - r"perftools::gputools") + r"stream_executor::") # We want to identify data members explicitly in the DEF file, so that no one # can implicitly link against the DLL if they use one of the variables exported diff --git a/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc b/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc index ae4d9d2836a0f8..81b36ca902b822 100644 --- a/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc +++ b/tensorflow/contrib/coder/kernels/range_coder_ops_test.cc @@ -20,7 +20,6 @@ limitations under the License. 
#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" #include "tensorflow/core/common_runtime/shape_refiner.h" #include "tensorflow/core/framework/fake_input.h" -#include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/node_def.pb.h" #include "tensorflow/core/framework/node_def_builder.h" #include "tensorflow/core/framework/op.h" diff --git a/tensorflow/contrib/coder/python/layers/entropybottleneck.py b/tensorflow/contrib/coder/python/layers/entropybottleneck.py index f039cb0f5265b9..0fbe3081af0b4d 100644 --- a/tensorflow/contrib/coder/python/layers/entropybottleneck.py +++ b/tensorflow/contrib/coder/python/layers/entropybottleneck.py @@ -28,7 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.keras._impl.keras import engine +from tensorflow.python.keras import engine from tensorflow.python.ops import array_ops from tensorflow.python.ops import functional_ops from tensorflow.python.ops import init_ops diff --git a/tensorflow/contrib/compiler/jit_test.py b/tensorflow/contrib/compiler/jit_test.py index 29a593f6bcfa05..a56a01b16356e1 100644 --- a/tensorflow/contrib/compiler/jit_test.py +++ b/tensorflow/contrib/compiler/jit_test.py @@ -24,7 +24,6 @@ from tensorflow.python.framework import op_def_registry from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed -from tensorflow.python.framework import test_util from tensorflow.python.ops import gradients from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops @@ -170,12 +169,11 @@ def mulop(x1, x2): self.assertEqual(b"jit_scope_0", func_attrs["_XlaScope"].s) -@test_util.with_c_api class CompilationEnabledInGradientTest(test.TestCase): def testCompilationInGradient(self): with self.test_session(): - x = constant_op.constant([[3]]) + x = constant_op.constant([[3.]]) y_nc = math_ops.matmul(x, x, name="not_compiled") with jit.experimental_jit_scope(): y_c = math_ops.matmul(y_nc, y_nc, name="compiled") @@ -200,11 +198,11 @@ def testCompilationGradientScopeNames(self): with self.test_session(graph=ops.Graph()): with jit.experimental_jit_scope(): # XlaScope 0 - a1 = constant_op.constant([[1]]) + a1 = constant_op.constant([[1.]]) a1t = math_ops.matmul(a1, a1) with jit.experimental_jit_scope(): # XlaScope 1 - a2 = constant_op.constant([[1]]) + a2 = constant_op.constant([[1.]]) a2t = math_ops.matmul(a2, a2) self.assertEqual(b"jit_scope_0", a1.op.get_attr("_XlaScope")) @@ -222,11 +220,11 @@ def testCompilationSeparateGradientScopeNames(self): with self.test_session(graph=ops.Graph()): with jit.experimental_jit_scope(True, separate_compiled_gradients=True): # XlaScope 0 - a1 = constant_op.constant([[1]]) + a1 = constant_op.constant([[1.]]) a1t = math_ops.matmul(a1, a1) with jit.experimental_jit_scope(True, separate_compiled_gradients=True): # XlaScope 1 - a2 = constant_op.constant([[1]]) + a2 = constant_op.constant([[1.]]) a2t = math_ops.matmul(a2, a2) self.assertEqual(b"jit_scope_0", a1.op.get_attr("_XlaScope")) diff --git a/tensorflow/contrib/control_flow/BUILD b/tensorflow/contrib/control_flow/BUILD new file mode 100644 index 00000000000000..746b5b5b5e2fa2 --- /dev/null +++ b/tensorflow/contrib/control_flow/BUILD @@ -0,0 +1,48 @@ +# New implementations of control flow ops + +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//visibility:public"]) + +load("//tensorflow:tensorflow.bzl", "tf_py_test") + 
+py_library( + name = "control_flow", + srcs = ["__init__.py"], + srcs_version = "PY2AND3", + deps = [ + ":cond_v2", + ], +) + +py_library( + name = "cond_v2", + srcs = ["python/cond_v2.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:array_ops", + "//tensorflow/python:c_api_util", + "//tensorflow/python:framework_ops", + "//tensorflow/python:function", + "//tensorflow/python:functional_ops_gen", + "//tensorflow/python:gradients", + "//tensorflow/python:pywrap_tensorflow", + ], +) + +tf_py_test( + name = "cond_v2_test", + size = "small", + srcs = ["python/cond_v2_test.py"], + additional_deps = [ + ":cond_v2", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework", + "//tensorflow/python:gradients", + ], + grpc_enabled = True, +) diff --git a/tensorflow/python/keras/_impl/keras/wrappers/__init__.py b/tensorflow/contrib/control_flow/__init__.py similarity index 68% rename from tensorflow/python/keras/_impl/keras/wrappers/__init__.py rename to tensorflow/contrib/control_flow/__init__.py index 20c95929e3d2e1..582af2cf10a3d9 100644 --- a/tensorflow/python/keras/_impl/keras/wrappers/__init__.py +++ b/tensorflow/contrib/control_flow/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2015 The TensorFlow Authors. All Rights Reserved. +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,11 +12,20 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""Keras API wrappers. + +"""New implementations of TF control flow ops. + +@@cond_v2 """ + from __future__ import absolute_import from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.wrappers import scikit_learn +# pylint: disable=unused-import +from tensorflow.contrib.control_flow.python.cond_v2 import cond_v2 +# pylint: enable=unused-import + +from tensorflow.python.util.all_util import remove_undocumented +remove_undocumented(__name__) diff --git a/tensorflow/contrib/control_flow/python/cond_v2.py b/tensorflow/contrib/control_flow/python/cond_v2.py new file mode 100644 index 00000000000000..90c678d0f6bd21 --- /dev/null +++ b/tensorflow/contrib/control_flow/python/cond_v2.py @@ -0,0 +1,394 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================= +"""cond_v2 and gradient. + +This is a version of cond that emits a single If op, as well as the gradient +function for If ops produced by cond_v2. This will eventually replace the +current tf.cond implementation once it reaches feature and performance parity. 
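+
+Example (a minimal sketch; assumes graph mode and that `tf` is the standard
+TensorFlow module, mirroring tf.cond usage):
+
+```python
+pred = tf.placeholder(tf.bool, shape=[])
+x = tf.constant(2.0)
+# Emits one If op rather than the Switch/Merge nodes built by tf.cond.
+outputs = cond_v2(pred, lambda: x * 2.0, lambda: x * 3.0)
+with tf.Session() as sess:
+  print(sess.run(outputs, feed_dict={pred: True}))  # [4.0]
+```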
+""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python import pywrap_tensorflow as c_api +from tensorflow.python.framework import c_api_util +from tensorflow.python.framework import function +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gen_functional_ops +from tensorflow.python.ops import gradients_impl + + +# NOTE(skyewm): TensorFlow uses protected class methods and fields to signify +# that they aren't part of the official public API. These protected members +# often need to be used by implementation code however. Rather than litter the +# code with pylint comments, we ignore protected access violations for +# readability. +# pylint: disable=protected-access + + +def cond_v2(pred, true_fn, false_fn, name="cond"): + """Like tf.cond, except emits a single If op.""" + with ops.name_scope(name) as scope: + true_graph = function.func_graph_from_py_func(true_fn, [], [], + name="%s_true" % scope) + false_graph = function.func_graph_from_py_func(false_fn, [], [], + name="%s_false" % scope) + _check_same_outputs(true_graph, false_graph) + + # Add inputs to true_graph and false_graph to make them match. Note that + # this modifies true_graph and false_graph. + cond_inputs = _make_inputs_match(true_graph, false_graph, + true_graph.extra_inputs, + false_graph.extra_inputs) + + # Add all intermediate tensors as function outputs so they're available for + # the gradient computation. + + true_intermediates = _get_intermediates(true_graph) + false_intermediates = _get_intermediates(false_graph) + + # Save the original number of outputs to return to the caller. + num_cond_outputs = len(true_graph.outputs) + + # Make the number/type of new intermediate outputs match. + extra_true_outputs, extra_false_outputs = _pad_params( + true_graph, false_graph, true_intermediates, false_intermediates) + + true_graph.outputs.extend(extra_true_outputs) + false_graph.outputs.extend(extra_false_outputs) + + # Create the If op. + tensors = gen_functional_ops._if( + pred, cond_inputs, [t.dtype for t in true_graph.outputs], + _create_new_tf_function(true_graph), + _create_new_tf_function(false_graph), + name=scope) + + # TODO(b/79883549): if we could make Graphs from FunctionDefs, we wouldn't + # need this extra state. Requiring extra state also prevents the ability to + # take the gradient of deserialized If ops. + tensors[0].op._true_graph = true_graph + tensors[0].op._false_graph = false_graph + + return tensors[:num_cond_outputs] + + +@ops.RegisterGradient("If") +def _IfGrad(op, *grads): # pylint: disable=invalid-name + """The gradient of an If op produced by cond_v2.""" + true_graph = op._true_graph + false_graph = op._false_graph + + # Create grad functions that compute the gradient of the true/false forward + # graphs. These functions will capture tensors from the forward pass + # functions. + true_grad_graph = _create_grad_func( + true_graph, grads, "%sgrad" % true_graph.name) + false_grad_graph = _create_grad_func( + false_graph, grads, "%sgrad" % false_graph.name) + + assert ([t.dtype for t in true_grad_graph.outputs] == + [t.dtype for t in false_grad_graph.outputs]) + + # Match up the captured grad function inputs with outputs of 'op' and other + # external tensors. 
+ true_grad_inputs = _get_grad_inputs(op, true_graph, true_grad_graph) + false_grad_inputs = _get_grad_inputs(op, false_graph, false_grad_graph) + + # Make the inputs to true_grad_graph and false_grad_graph match. Note that + # this modifies true_grad_graph and false_grad_graph. + grad_inputs = _make_inputs_match(true_grad_graph, false_grad_graph, + true_grad_inputs, false_grad_inputs) + + # Add all intermediate tensors as function outputs so they're available for + # higher-order gradient computations. + + true_grad_intermediates = _get_intermediates(true_grad_graph) + false_grad_intermediates = _get_intermediates(false_grad_graph) + + # Save the original number of gradient outputs to return. + num_grad_outputs = len(true_grad_graph.outputs) + + # Make the number/type of new intermediate outputs match. + extra_true_grad_outputs, extra_false_grad_outputs = _pad_params( + true_grad_graph, false_grad_graph, + true_grad_intermediates, false_grad_intermediates) + + true_grad_graph.outputs.extend(extra_true_grad_outputs) + false_grad_graph.outputs.extend(extra_false_grad_outputs) + + # Create the gradient If op. + tensors = gen_functional_ops._if( + op.inputs[0], grad_inputs, [t.dtype for t in true_grad_graph.outputs], + _create_new_tf_function(true_grad_graph), + _create_new_tf_function(false_grad_graph)) + tensors[0].op._true_graph = true_grad_graph + tensors[0].op._false_graph = false_grad_graph + + # The predicate has no gradient. + return [None] + tensors[:num_grad_outputs] + + +def _grad_fn(func_graph, grads): + """The gradient function for each conditional branch. + + This function builds the gradient graph of the corresponding forward-pass + conditional branch in `func_graph`. This is done by differentiating + func_graph's outputs w.r.t. its inputs. + + Args: + func_graph: function._FuncGraph. The corresponding forward-pass function. + grads: The list of input gradient Tensors. + + Returns: + The output gradient Tensors. + """ + # Filter out untrainable function outputs. + # NOTE(skyewm): If we don't do this, the untrainable tensors can sometimes + # cause _GradientsHelper to raise an exception (e.g. the implementation + # doesn't expect 'ys' to contain boolean tensors). + assert len(func_graph.outputs) == len(grads) + ys = [] + grad_ys = [] + for y, grad_y in zip(func_graph.outputs, grads): + if not gradients_impl._IsTrainable(y): + continue + ys.append(y) + grad_ys.append(grad_y) + + # Build the gradient graph. Note that this builds the gradient computation of + # func_graph in the current graph, which requires capturing tensors from + # func_graph. The captured func_graph tensors are resolved to external tensors + # in _get_grad_inputs. + result = gradients_impl._GradientsHelper( + ys, func_graph.inputs, grad_ys=grad_ys, + src_graph=func_graph) + + # Functions can't return None; replace Nones with zero tensors. + # TODO(b/80444525): don't return anything here and make _IfGrad return None if + # both branches have zero gradient. + for i in range(len(result)): + if result[i] is None: + result[i] = array_ops.zeros_like(func_graph.inputs[i]) + + return result + + +def _create_grad_func(func_graph, grads, name): + """Returns the _FuncGraph representation of _grad_fn.""" + return function.func_graph_from_py_func(lambda: _grad_fn(func_graph, grads), + [], [], name) + + +def _get_grad_inputs(if_op, cond_graph, grad_graph): + """Returns the tensors we should pass to grad_graph. + + This method handles tensors captured from cond_graph in grad_graph. 
It + converts these to suitable input tensors from the outer graph. + + Args: + if_op: Operation. The forward-pass If op that uses cond_graph. + cond_graph: function._FuncGraph. The forward-pass function. + grad_graph: function._FuncGraph. The gradients function. + + Returns: + A list of inputs tensors to be passed to grad_graph. + """ + inputs = [] + + # Maps placeholders in cond_graph -> input tensor in outer graph. + forward_input_map = {v: k for k, v in cond_graph._captured.items()} + + for t in grad_graph.extra_inputs: + if t.graph == ops.get_default_graph(): + # t is in the outer graph (e.g. one of the input gradients). + inputs.append(t) + elif t in forward_input_map: + # t is an input placeholder in cond_graph. Get the corresponding input + # tensor in the outer graph. + assert t.graph == cond_graph + assert forward_input_map[t].graph == ops.get_default_graph() + inputs.append(forward_input_map[t]) + else: + # t is an intermediate value in cond_graph. Get the corresponding output + # of 'if_op' (note that all intermediate values are outputs). + assert t.graph == cond_graph + output_idx = cond_graph.outputs.index(t) + inputs.append(if_op.outputs[output_idx]) + + return inputs + + +def _create_new_tf_function(func_graph): + """Converts func_graph to a TF_Function and adds it to the current graph. + + Args: + func_graph: function._FuncGraph + + Returns: + The name of the new TF_Function. + """ + func_graph.name = "%s_" % func_graph.name + c_func = c_api.TF_GraphToFunction_wrapper( + func_graph._c_graph, + func_graph.name, + False, # append_hash_to_fn_name + None, # opers + [t._as_tf_output() for t in func_graph.inputs], + [t._as_tf_output() for t in func_graph.outputs], + [], + None, # opts + None) # description + c_func = c_api_util.ScopedTFFunction(c_func) + c_api.TF_GraphCopyFunction( + ops.get_default_graph()._c_graph, c_func.func, None) + return func_graph.name + + +def _get_intermediates(func_graph): + """Returns all tensors in `func_graph` that aren't inputs or outputs.""" + intermediates = [] + for op in func_graph.get_operations(): + for t in op.outputs: + if t in func_graph.inputs: continue + if t in func_graph.outputs: continue + intermediates.append(t) + return intermediates + + +def _separate_unique_inputs(true_inputs, false_inputs): + """Separates tensors appearing only in true_inputs or false_inputs, or both. + + Args: + true_inputs: list of Tensors + false_inputs: list of Tensors + + Returns: + Three lists of Tensors: + 1. The tensors that appear in both true_inputs and false_inputs + 2. The tensors that only appear in true_inputs + 3. The tensors that only appear in false_inputs + """ + true_inputs = set(true_inputs) + false_inputs = set(false_inputs) + + shared_inputs = true_inputs.intersection(false_inputs) + true_only_inputs = true_inputs - false_inputs + false_only_inputs = false_inputs - true_inputs + + return list(shared_inputs), list(true_only_inputs), list(false_only_inputs) + + +def _pad_params(true_graph, false_graph, true_params, false_params): + """Returns new param lists that have matching signatures. + + This is done by mirroring each param list in the other using dummy params. + There is no merging of params. + + Args: + true_graph: function._FuncGraph + false_graph: function._FuncGraph + true_params: a list of Tensors from true_graph + false_params: a list of Tensors from false_graph + + Returns: + A new list of Tensors in true_graph and a new list of Tensors in + false_graph. 
The two lists have the same number of Tensors, with matching + types and shapes across the lists. + """ + new_true_params = (true_params + + _create_dummy_params(true_graph, false_params)) + new_false_inputs = (_create_dummy_params(false_graph, true_params) + + false_params) + return new_true_params, new_false_inputs + + +def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs): + """Modifies true_graph and false_graph so they have the same input signature. + + This method reorders and/or adds parameters to true_graph and false_graph so + they have the same input signature, and updates the 'inputs', 'extra_inputs', + and '_captured' fields of both graphs accordingly. It uses the input tensors + from the outer graph to avoid duplicating shared arguments. + + Args: + true_graph: function._FuncGraph + false_graph: function._FuncGraph + true_inputs: a list of Tensors in the outer graph. The inputs for + true_graph. + false_inputs: a list of Tensors in the outer graph. The inputs for + false_graph. + + Returns: + A new list of Tensors from the outer graph that are the new inputs for both + true_graph and false_graph. This is a deduped version of true_inputs + + false_inputs. + """ + shared_inputs, true_only_inputs, false_only_inputs = _separate_unique_inputs( + true_inputs, false_inputs) + + new_inputs = shared_inputs + true_only_inputs + false_only_inputs + + true_input_to_param = dict(zip(true_inputs, true_graph.inputs)) + false_input_to_param = dict(zip(false_inputs, false_graph.inputs)) + + true_graph.inputs = ( + [true_input_to_param[t] for t in shared_inputs] + + [true_input_to_param[t] for t in true_only_inputs] + + _create_dummy_params(true_graph, false_only_inputs)) + + false_graph.inputs = ( + [false_input_to_param[t] for t in shared_inputs] + + _create_dummy_params(false_graph, true_only_inputs) + + [false_input_to_param[t] for t in false_only_inputs]) + + # Rewrite the _FuncGraphs' state to reflect the new inputs. + true_graph.extra_inputs = new_inputs + false_graph.extra_inputs = new_inputs + + true_graph._captured = dict(zip(new_inputs, true_graph.inputs)) + false_graph._captured = dict(zip(new_inputs, false_graph.inputs)) + + return new_inputs + + +def _create_dummy_params(func_graph, template_tensors): + """Creates tensors in func_graph to represent template_tensors. + + Args: + func_graph: function._FuncGraph. + template_tensors: a list of tensors in the outer graph. + + Returns: + A list of tensors in func_graph. + """ + with func_graph.as_default(): + return [gen_functional_ops.fake_param(dtype=t.dtype, shape=t.shape) + for t in template_tensors] + + +def _check_same_outputs(true_graph, false_graph): + """Raises an error if true_graph and false_graph have different outputs.""" + true_output_types = [t.dtype for t in true_graph.outputs] + false_output_types = [t.dtype for t in false_graph.outputs] + if (len(true_graph.outputs) != len(false_graph.outputs) or + true_output_types != false_output_types): + raise ValueError( + "true_fn() and false_fn() must return the same number and type of " + "arguments, got:\n" + " true_fn: %s\n" + " false_fn: %s" % (true_output_types, false_output_types)) diff --git a/tensorflow/contrib/control_flow/python/cond_v2_test.py b/tensorflow/contrib/control_flow/python/cond_v2_test.py new file mode 100644 index 00000000000000..166002ca7faa41 --- /dev/null +++ b/tensorflow/contrib/control_flow/python/cond_v2_test.py @@ -0,0 +1,114 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Tests for cond_v2.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.control_flow.python import cond_v2 +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import gradients_impl +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import test + + +class NewCondTest(test.TestCase): + + def _testCond(self, true_fn, false_fn, train_vals): + pred = array_ops.placeholder(dtypes.bool, name="pred") + + expected = control_flow_ops.cond(pred, true_fn, false_fn, name="expected") + actual = cond_v2.cond_v2(pred, true_fn, false_fn, name="actual") + + expected_grad = gradients_impl.gradients(expected, train_vals) + actual_grad = gradients_impl.gradients(actual, train_vals) + + with self.test_session() as sess: + expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run( + (expected, actual, expected_grad, actual_grad), {pred: True}) + self.assertEqual(expected_val, actual_val) + self.assertEqual(expected_grad_val, actual_grad_val) + + expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run( + (expected, actual, expected_grad, actual_grad), {pred: False}) + self.assertEqual(expected_val, actual_val) + self.assertEqual(expected_grad_val, actual_grad_val) + + def testBasic(self): + x = constant_op.constant(1.0, name="x") + y = constant_op.constant(2.0, name="y") + + def true_fn(): + return x * 2.0 + + def false_fn(): + return y * 3.0 + + self._testCond(true_fn, false_fn, [x]) + self._testCond(true_fn, false_fn, [x, y]) + self._testCond(true_fn, false_fn, [y]) + + def testBasic2(self): + x = constant_op.constant(1.0, name="x") + y = constant_op.constant(2.0, name="y") + + def true_fn(): + return x * y * 2.0 + + def false_fn(): + return 2.0 + + self._testCond(true_fn, false_fn, [x]) + self._testCond(true_fn, false_fn, [x, y]) + self._testCond(true_fn, false_fn, [y]) + + def testSecondDerivative(self): + self.skipTest("b/109758172") + pred = array_ops.placeholder(dtypes.bool, name="pred") + x = constant_op.constant(3.0, name="x") + + def true_fn(): + return math_ops.pow(x, 3) + + def false_fn(): + return x + + cond = cond_v2.cond_v2(pred, true_fn, false_fn, name="cond") + cond_grad = gradients_impl.gradients(cond, [x]) + cond_grad_grad = gradients_impl.gradients(cond_grad, [x]) + + with self.test_session() as sess: + # d[x^3]/dx = 3x^2 + true_val = sess.run(cond_grad, {pred: True}) + self.assertEqual(true_val, [27.0]) + # d[x]/dx = 1 + false_val = sess.run(cond_grad, {pred: False}) + self.assertEqual(false_val, [1.0]) + + true_val = sess.run(cond_grad_grad, {pred: True}) + # d2[x^3]/dx2 = 6x + self.assertEqual(true_val, [18.0]) + false_val = 
sess.run(cond_grad_grad, {pred: False}) + # d2[x]/dx2 = 0 + self.assertEqual(false_val, [0.0]) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py index 102bc460fdadb0..a0dd3881a86c19 100644 --- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py +++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py @@ -218,7 +218,6 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''): new_control_inputs, input_types, new_original_op, op_def) #Use Graph's hidden methods to add the op - to_graph._add_op(new_op) # pylint: disable=protected-access to_graph._record_op_seen_by_control_dependencies(new_op) for device_function in reversed(to_graph._device_function_stack): new_op._set_device(device_function(new_op)) diff --git a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py index a5e065b93a23c3..74f2ec22ffaab1 100644 --- a/tensorflow/contrib/crf/python/kernel_tests/crf_test.py +++ b/tensorflow/contrib/crf/python/kernel_tests/crf_test.py @@ -152,6 +152,22 @@ def testCrfLogNorm(self): self.assertAllClose(tf_log_norm, tf_brute_force_log_norm) + def testCrfLogNormZeroSeqLength(self): + """ + Test `crf_log_norm` when `sequence_lengths` contains one or more zeros. + """ + with self.test_session() as sess: + inputs = constant_op.constant(np.ones([2, 10, 5], + dtype=np.float32)) + transition_params = constant_op.constant(np.ones([5, 5], + dtype=np.float32)) + sequence_lengths = constant_op.constant(np.zeros([2], + dtype=np.int32)) + expected_log_norm = np.zeros([2], dtype=np.float32) + log_norm = crf.crf_log_norm(inputs, sequence_lengths, transition_params) + tf_log_norm = sess.run(log_norm) + self.assertAllClose(tf_log_norm, expected_log_norm) + def testCrfLogLikelihood(self): inputs = np.array( [[4, 5, -3], [3, -1, 3], [-1, 2, 1], [0, 0, 0]], dtype=np.float32) @@ -292,10 +308,10 @@ def testCrfDecodeZeroSeqLength(self): dtype=np.float32)) sequence_lengths = constant_op.constant(np.zeros([2], dtype=np.int32)) - values = crf.crf_decode(inputs, transition_params, sequence_lengths) - tags, scores = sess.run(values) - self.assertEqual(len(tags.shape), 2) - self.assertEqual(len(scores.shape), 1) + tags, scores = crf.crf_decode(inputs, transition_params, sequence_lengths) + tf_tags, tf_scores = sess.run([tags, scores]) + self.assertEqual(len(tf_tags.shape), 2) + self.assertEqual(len(tf_scores.shape), 1) if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py index e37c029cebf30e..2d2cbdc1990ed9 100644 --- a/tensorflow/contrib/crf/python/ops/crf.py +++ b/tensorflow/contrib/crf/python/ops/crf.py @@ -52,6 +52,7 @@ import numpy as np +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.layers import utils from tensorflow.python.ops import array_ops @@ -90,9 +91,13 @@ def _single_seq_fn(): batch_size = array_ops.shape(inputs, out_type=tag_indices.dtype)[0] example_inds = array_ops.reshape( math_ops.range(batch_size, dtype=tag_indices.dtype), [-1, 1]) - return array_ops.gather_nd( + sequence_scores = array_ops.gather_nd( array_ops.squeeze(inputs, [1]), array_ops.concat([example_inds, tag_indices], axis=1)) + sequence_scores = array_ops.where(math_ops.less_equal(sequence_lengths, 0), + array_ops.zeros_like(sequence_scores), + sequence_scores) + return 
sequence_scores def _multi_seq_fn(): # Compute the scores of the given tag sequence. @@ -128,7 +133,12 @@ def crf_log_norm(inputs, sequence_lengths, transition_params): # If max_seq_len is 1, we skip the algorithm and simply reduce_logsumexp over # the "initial state" (the unary potentials). def _single_seq_fn(): - return math_ops.reduce_logsumexp(first_input, [1]) + log_norm = math_ops.reduce_logsumexp(first_input, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0), + array_ops.zeros_like(log_norm), + log_norm) + return log_norm def _multi_seq_fn(): """Forward computation of alpha values.""" @@ -137,13 +147,21 @@ def _multi_seq_fn(): # Compute the alpha values in the forward algorithm in order to get the # partition function. forward_cell = CrfForwardRnnCell(transition_params) + # Sequence length is not allowed to be less than zero. + sequence_lengths_less_one = math_ops.maximum( + constant_op.constant(0, dtype=sequence_lengths.dtype), + sequence_lengths - 1) _, alphas = rnn.dynamic_rnn( cell=forward_cell, inputs=rest_of_input, - sequence_length=sequence_lengths - 1, + sequence_length=sequence_lengths_less_one, initial_state=first_input, dtype=dtypes.float32) log_norm = math_ops.reduce_logsumexp(alphas, [1]) + # Mask `log_norm` of the sequences with length <= zero. + log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0), + array_ops.zeros_like(log_norm), + log_norm) return log_norm max_seq_len = array_ops.shape(inputs)[1] @@ -479,7 +497,7 @@ def _multi_seq_fn(): initial_state = array_ops.slice(potentials, [0, 0, 0], [-1, 1, -1]) initial_state = array_ops.squeeze(initial_state, axis=[1]) # [B, O] inputs = array_ops.slice(potentials, [0, 1, 0], [-1, -1, -1]) # [B, T-1, O] - # sequence length is not allowed to be less than zero + # Sequence length is not allowed to be less than zero. sequence_length_less_one = math_ops.maximum(0, sequence_length - 1) backpointers, last_score = rnn.dynamic_rnn( # [B, T - 1, O], [B, O] crf_fwd_cell, diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py index 012b17cee88aec..8285ea04926d3a 100644 --- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py +++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py @@ -54,11 +54,11 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import adagrad from tensorflow.python.training import adam -from tensorflow.python.training import checkpointable_utils from tensorflow.python.training import gradient_descent from tensorflow.python.training import momentum from tensorflow.python.training import rmsprop from tensorflow.python.training import saver as saver_lib +from tensorflow.python.training.checkpointable import util as checkpointable_utils CUDNN_LSTM = cudnn_rnn_ops.CUDNN_LSTM @@ -717,7 +717,7 @@ def _VerifyCheckpoint( inputs = 3. * array_ops.ones([num_applications, num_layers, input_size], dtype=dtypes.float32) cudnn_output, _ = cudnn_layer(inputs) - status.assert_consumed().run_restore_ops() + status.run_restore_ops() second_save_path = cudnn_checkpoint.save(checkpoint_prefix) restore_layer = compatible_cell_fn() restore_layer_checkpoint = checkpointable_utils.Checkpoint( @@ -728,7 +728,7 @@ def _VerifyCheckpoint( restore_layer_output, current_state = restore_layer( inputs=3. 
* array_ops.ones([1, input_size]), state=current_state) - status.assert_consumed().run_restore_ops() + status.run_restore_ops() self.assertTrue(restore_layer.variables) for variable, expected_value in zip( restore_layer.variables, expected_variable_values): diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py index 73a961992e19fa..8822a7523f6b16 100644 --- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py +++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py @@ -20,11 +20,10 @@ import os from tensorflow.contrib.checkpoint.python import split_dependency from tensorflow.contrib.rnn.python.ops import lstm_ops -from tensorflow.python.framework import common_shapes from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed -from tensorflow.python.keras._impl.keras.engine import base_layer +from tensorflow.python.keras.engine import base_layer from tensorflow.python.ops import array_ops from tensorflow.python.ops import gen_cudnn_rnn_ops from tensorflow.python.ops import init_ops @@ -33,8 +32,8 @@ from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.ops import state_ops from tensorflow.python.ops import variable_scope as vs -from tensorflow.python.training import checkpointable as checkpointable_lib from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import base as checkpointable_lib CUDNN_RNN_UNIDIRECTION = "unidirectional" CUDNN_RNN_BIDIRECTION = "bidirectional" @@ -1647,10 +1646,3 @@ class CudnnRNNRelu(_CudnnRNNNoInputC): # 1 set of weight and bias parameters for the recurrent input, and 1 for the # previous layer input. _NUM_PARAMS_PER_LAYER = CUDNN_RNN_RELU_PARAMS_PER_LAYER - - -ops.RegisterShape("CudnnRNNParamsSize")(common_shapes.call_cpp_shape_fn) -ops.RegisterShape("CudnnRNNParamsToCanonical")(common_shapes.call_cpp_shape_fn) -ops.RegisterShape("CudnnRNNCanonicalToParams")(common_shapes.call_cpp_shape_fn) -ops.RegisterShape("CudnnRNN")(common_shapes.call_cpp_shape_fn) -ops.RegisterShape("CudnnRNNBackprop")(common_shapes.call_cpp_shape_fn) diff --git a/tensorflow/contrib/data/__init__.py b/tensorflow/contrib/data/__init__.py index 077cbba9d2ae41..1af1ed08b53ee0 100644 --- a/tensorflow/contrib/data/__init__.py +++ b/tensorflow/contrib/data/__init__.py @@ -23,11 +23,14 @@ See the @{$datasets$Importing Data} Programmer's Guide for an overview. 
@@Counter +@@CheckpointInputPipelineHook +@@CsvDataset @@SqlDataset @@assert_element_shape @@batch_and_drop_remainder @@bucket_by_sequence_length +@@choose_from_datasets @@dense_to_sparse_batch @@enumerate_dataset @@group_by_window @@ -72,8 +75,10 @@ from tensorflow.contrib.data.python.ops.interleave_ops import parallel_interleave from tensorflow.contrib.data.python.ops.interleave_ops import sample_from_datasets from tensorflow.contrib.data.python.ops.interleave_ops import sloppy_interleave +from tensorflow.contrib.data.python.ops.iterator_ops import CheckpointInputPipelineHook from tensorflow.contrib.data.python.ops.iterator_ops import make_saveable_from_iterator from tensorflow.contrib.data.python.ops.prefetching_ops import prefetch_to_device +from tensorflow.contrib.data.python.ops.readers import CsvDataset from tensorflow.contrib.data.python.ops.readers import make_batched_features_dataset from tensorflow.contrib.data.python.ops.readers import make_csv_dataset from tensorflow.contrib.data.python.ops.readers import read_batch_features diff --git a/tensorflow/contrib/data/kernels/BUILD b/tensorflow/contrib/data/kernels/BUILD index c56910c7833d4c..7b69e10441eba3 100644 --- a/tensorflow/contrib/data/kernels/BUILD +++ b/tensorflow/contrib/data/kernels/BUILD @@ -29,6 +29,16 @@ cc_library( alwayslink = 1, ) +cc_library( + name = "csv_dataset_op", + srcs = ["csv_dataset_op.cc"], + deps = [ + "//tensorflow/core:framework_headers_lib", + "//third_party/eigen3", + "@protobuf_archive//:protobuf_headers", + ], +) + cc_library( name = "ignore_errors_dataset_op", srcs = ["ignore_errors_dataset_op.cc"], @@ -63,6 +73,7 @@ cc_library( cc_library( name = "dataset_kernels", deps = [ + ":csv_dataset_op", ":directed_interleave_dataset_op", ":ignore_errors_dataset_op", ":prefetching_kernels", diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc new file mode 100644 index 00000000000000..e88ad3dc32003e --- /dev/null +++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc @@ -0,0 +1,758 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// See docs in ../ops/parsing_ops.cc. 
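The kernel below implements the `CSVDataset` op behind the `CsvDataset` class that `tf.contrib.data` now exports (see the `__init__.py` hunk above). A minimal, hedged usage sketch; the keyword names are assumptions inferred from the op's inputs, not taken from this patch:

```python
# Illustrative sketch only: read two float32 columns from a headerless CSV.
import tensorflow as tf

dataset = tf.contrib.data.CsvDataset(
    ["/tmp/example.csv"],                 # filenames (scalar or vector)
    record_defaults=[tf.constant([0.0]),  # one default per selected column;
                     tf.constant([0.0])], # these also fix each column's dtype
    field_delim=",",
    use_quote_delim=True,                 # honor '"'-quoted fields
    na_value="NA")                        # "NA" fields fall back to defaults

iterator = dataset.make_one_shot_iterator()
next_row = iterator.get_next()

with tf.Session() as sess:
  print(sess.run(next_row))  # a tuple with one scalar per column
```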
+#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/dataset.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/lib/io/random_inputstream.h" + +namespace tensorflow { +namespace { + +class CSVDatasetOp : public DatasetOpKernel { + public: + explicit CSVDatasetOp(OpKernelConstruction* ctx) : DatasetOpKernel(ctx) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_)); + OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_)); + } + + void MakeDataset(OpKernelContext* ctx, DatasetBase** output) override { + const Tensor* filenames_tensor; + OP_REQUIRES_OK(ctx, ctx->input("filenames", &filenames_tensor)); + OP_REQUIRES( + ctx, filenames_tensor->dims() <= 1, + errors::InvalidArgument("`filenames` must be a scalar or a vector.")); + + OpInputList record_defaults_list; + OP_REQUIRES_OK(ctx, + ctx->input_list("record_defaults", &record_defaults_list)); + for (int i = 0; i < record_defaults_list.size(); ++i) { + OP_REQUIRES(ctx, record_defaults_list[i].NumElements() < 2, + errors::InvalidArgument( + "There should only be 1 default per field but field ", i, + " has ", record_defaults_list[i].NumElements())); + } + + const Tensor* select_cols_tensor; + OP_REQUIRES_OK(ctx, ctx->input("select_cols", &select_cols_tensor)); + OP_REQUIRES(ctx, select_cols_tensor->dims() == 1, + errors::InvalidArgument("`select_cols` must be a vector.")); + + int64 buffer_size; + OP_REQUIRES_OK( + ctx, ParseScalarArgument(ctx, "buffer_size", &buffer_size)); + OP_REQUIRES(ctx, buffer_size > 0, + errors::InvalidArgument("buffer_size should be positive")); + + string delim; + OP_REQUIRES_OK(ctx, + ParseScalarArgument(ctx, "field_delim", &delim)); + OP_REQUIRES(ctx, delim.size() == 1, + errors::InvalidArgument("field_delim should be only 1 char")); + + bool header; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "header", &header)); + + bool use_quote_delim; + OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "use_quote_delim", + &use_quote_delim)); + string na_value; + OP_REQUIRES_OK(ctx, + ParseScalarArgument(ctx, "na_value", &na_value)); + + std::vector record_defaults; + record_defaults.reserve(record_defaults_list.size()); + for (const Tensor& t : record_defaults_list) { + record_defaults.push_back(t); + } + + std::vector filenames; + filenames.reserve(filenames_tensor->NumElements()); + for (int i = 0; i < filenames_tensor->NumElements(); ++i) { + filenames.push_back(filenames_tensor->flat()(i)); + } + + std::vector select_cols; + select_cols.reserve(select_cols_tensor->NumElements()); + for (int i = 0; i < select_cols_tensor->NumElements(); ++i) { + select_cols.push_back(select_cols_tensor->flat()(i)); + } + OP_REQUIRES( + ctx, output_types_.size() == select_cols.size() || select_cols.empty(), + errors::InvalidArgument("select_cols should match output size")); + for (int i = 1; i < select_cols.size(); i++) { + OP_REQUIRES(ctx, select_cols[i - 1] < select_cols[i], + errors::InvalidArgument( + "select_cols should be strictly increasing indices")); + } + OP_REQUIRES( + ctx, select_cols.empty() || select_cols.front() >= 0, + errors::InvalidArgument("select_cols should be non-negative indices")); + + *output = new Dataset(ctx, std::move(filenames), header, buffer_size, + output_types_, output_shapes_, + std::move(record_defaults), std::move(select_cols), + use_quote_delim, delim[0], std::move(na_value)); + } + + private: + class Dataset : public GraphDatasetBase { + public: + 
Dataset(OpKernelContext* ctx, std::vector filenames, bool header, + int64 buffer_size, const DataTypeVector& output_types, + const std::vector& output_shapes, + std::vector record_defaults, std::vector select_cols, + bool use_quote_delim, char delim, string na_value) + : GraphDatasetBase(ctx), + filenames_(std::move(filenames)), + header_(header), + buffer_size_(buffer_size), + out_type_(output_types), + output_shapes_(output_shapes), + record_defaults_(std::move(record_defaults)), + select_cols_(std::move(select_cols)), + use_quote_delim_(use_quote_delim), + delim_(delim), + na_value_(std::move(na_value)) {} + + std::unique_ptr MakeIteratorInternal( + const string& prefix) const override { + return std::unique_ptr( + new Iterator({this, strings::StrCat(prefix, "::CSV")})); + } + + const DataTypeVector& output_dtypes() const override { return out_type_; } + + const std::vector& output_shapes() const override { + return output_shapes_; + } + + string DebugString() const override { return "CSVDatasetOp::Dataset"; } + + protected: + Status AsGraphDefInternal(DatasetGraphDefBuilder* b, + Node** output) const override { + // TODO(rachelim): Implement this + std::vector input_tensors; + TF_RETURN_IF_ERROR(b->AddDataset(this, input_tensors, output)); + return errors::Unimplemented("CSVDataset: AsGraphDefInternal"); + } + + private: + class Iterator : public DatasetIterator { + public: + explicit Iterator(const Params& params) + : DatasetIterator(params) {} + + Status GetNextInternal(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_sequence) override { + mutex_lock l(mu_); + bool select_all = dataset()->select_cols_.empty(); + do { + // We are currently processing a file, so try to read the next record + if (input_stream_) { + Status s = ReadRecord(ctx, out_tensors, select_all, + dataset()->select_cols_); + if (s.ok()) { + // Validate output + if (out_tensors->size() != dataset()->out_type_.size()) { + return errors::InvalidArgument( + "Expect ", dataset()->out_type_.size(), " fields but have ", + out_tensors->size(), " in record"); + } + + *end_of_sequence = false; + return s; + } + if (!errors::IsOutOfRange(s)) { + // Not at the end of file, return OK or non-EOF errors to caller. + *end_of_sequence = false; + return s; + } + // We have reached the end of the current file, so maybe + // move on to next file. + ResetStreamsLocked(); + ++current_file_index_; + } + // Iteration ends when there are no more files to process. + if (current_file_index_ == dataset()->filenames_.size()) { + *end_of_sequence = true; + return Status::OK(); + } + TF_RETURN_IF_ERROR(SetupStreamsLocked(ctx->env())); + } while (true); + } + + protected: + Status SaveInternal(IteratorStateWriter* writer) override { + mutex_lock l(mu_); + // TODO(rachelim): Implement save + return errors::Unimplemented("CSVDataset: SaveInternal"); + } + Status RestoreInternal(IteratorContext* ctx, + IteratorStateReader* reader) override { + mutex_lock l(mu_); + // TODO(rachelim): Implement restore + return errors::Unimplemented("CSVDataset: RestoreInternal"); + } + + private: + // Reads an entire CSV row from the input stream, either from the + // existing buffer or by filling the buffer as needed. Converts extracted + // fields to output tensors as we go. + // + // When this function is called, pos_ should be the index of the first + // character of the record in buffer_, or past the end of the buffer. + // Note: ctx and out_tensors are only used in this function + // when fields are included in the record. 
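+      // For example, SetupStreamsLocked() below calls ReadRecord() with null
+      // ctx and out_tensors (and include == false for every field) just to
+      // skip over a header line.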
+ Status ReadRecord(IteratorContext* ctx, std::vector* out_tensors, + bool select_all, const std::vector& selected) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (pos_ >= buffer_.size()) { + // At the end of the file, this will return errors::OutOfRange + TF_RETURN_IF_ERROR(FillBuffer(&buffer_)); + pos_ = 0; + } + + // The first character may be \n if this is the continuation of a + // \r\n linebreak between this and the previous record. If so, skip it. + + bool end_of_record = false; // Keep track of when we find \n, \r or EOF + size_t num_parsed = 0; + size_t num_selected_parsed = 0; + + Status result = Status::OK(); + + while (!end_of_record) { // Read till we reach \n, \r or EOF + bool include = + select_all || (num_selected_parsed < selected.size() && + selected[num_selected_parsed] == num_parsed); + + // Don't fail fast, so that the next call to GetNext may still return + // a valid record + result.Update( + ParseOneField(ctx, out_tensors, &end_of_record, include)); + + num_parsed++; + if (include) num_selected_parsed++; + } + + return result; + } + + // Parses one field from position pos_ in the buffer. Fields are + // delimited by delim, CRLF, or EOF. Advances pos_ to the first char of + // the next field. + Status ParseOneField(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_record, bool include) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (pos_ >= buffer_.size()) { + // If we get here, this means the previous field's end coincided + // with the end of the buffer. We can fill the buffer without abandon. + Status s = FillBuffer(&buffer_); + + if (errors::IsOutOfRange(s)) { + // Reached EOF, and last field is empty + *end_of_record = true; + if (include) { + return FieldToOutput(ctx, StringPiece(), out_tensors); + } else { + return Status::OK(); + } + } else if (!s.ok()) { + return s; // Surface other errors back to caller + } + + pos_ = 0; + } + + if (dataset()->use_quote_delim_ && buffer_[pos_] == '"') { + return ParseQuotedField(ctx, out_tensors, end_of_record, include); + } + + return ParseUnquotedField(ctx, out_tensors, end_of_record, include); + } + + // For keeping track of relevant parts of a field from a previous buffer + struct Piece { + size_t start; + size_t len; + string buffer; + + Piece(string buffer, size_t start, size_t len) + : start(start), len(len), buffer(std::move(buffer)) {} + }; + + // Given that pos_ exceeds the buffer, saves the relevant part of the + // current buffer (if necessary), fills the buffer, and resets indices to + // 0. + Status SaveAndFillBuffer(std::vector* earlier_pieces, + size_t* start, bool include) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + string temp_buffer; + + buffer_.swap(temp_buffer); + if (include && pos_ > *start) { + earlier_pieces->push_back( + Piece(std::move(temp_buffer), *start, pos_ - *start)); + } + pos_ = 0; + *start = 0; + return FillBuffer(&buffer_); + } + + // Parses unquoted field from position pos_ in the buffer. Continually + // reads from buffer until end of field is reached (delim, CRLF, or EOF). + // Advances pos_ to keep track of our position in the buffer as we go, + // stopping at the first character of the next field. 
+ Status ParseQuotedField(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_record, bool include) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + std::vector earlier_pieces; + size_t start = pos_; + pos_++; // Starting quotation mark + + while (true) { // Each iter reads 1 char, filling buffer if necessary + if (pos_ >= buffer_.size()) { + Status s = SaveAndFillBuffer(&earlier_pieces, &start, include); + if (errors::IsOutOfRange(s)) { + return errors::InvalidArgument( + "Reached end of file without closing quoted field in " + "record"); + } else if (!s.ok()) { + return s; // Surface all other errors to caller + } + } + + char ch = buffer_[pos_]; + if (ch == '"') { + // When we encounter a quote, we look ahead to the next character to + // decide what to do + pos_++; + if (pos_ >= buffer_.size()) { + Status s = SaveAndFillBuffer(&earlier_pieces, &start, include); + if (errors::IsOutOfRange(s)) { + // This was the last field. We are done + *end_of_record = true; + return QuotedFieldToOutput(ctx, StringPiece(), out_tensors, + earlier_pieces, include); + } else if (!s.ok()) { + return s; + } + } + + char next = buffer_[pos_]; + pos_++; + if (next == dataset()->delim_) { + return QuotedFieldToOutput( + ctx, StringPiece(&buffer_[start], pos_ - 1 - start), + out_tensors, earlier_pieces, include); + + } else if (next == '\n' || next == '\r') { + *end_of_record = true; + Status s = QuotedFieldToOutput( + ctx, StringPiece(&buffer_[start], pos_ - 1 - start), + out_tensors, earlier_pieces, include); + if (next == '\r') SkipNewLineIfNecessary(); + return s; + } else if (next != '"') { + return errors::InvalidArgument( + "Quote inside a string has to be escaped by another quote"); + } + + } else { + pos_++; + } + } + } + + // Converts quoted field to an output tensor, removing the starting + // and ending quotes from it and unescaping double quotations if + // necessary. + Status QuotedFieldToOutput(IteratorContext* ctx, StringPiece field, + std::vector* out_tensors, + const std::vector& earlier_pieces, + bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (!include) return Status::OK(); + + if (earlier_pieces.empty()) { + if (field.find('\"', 1) == field.size() - 1) { + // `field` contains no escaped quotation marks. + // Exclude framing quotation marks + field.remove_prefix(1); + field.remove_suffix(1); + return FieldToOutput(ctx, field, out_tensors); + } + } + string field_complete; + size_t str_len = field.size(); + for (const Piece& p : earlier_pieces) { + str_len += p.len; + } + field_complete.reserve(str_len); + + // This bool flips every time we see a quote, so that we skip the second + // quote of every pair of adjacent quotes in the field. We need to track + // this across iterations of the for loop because adjacent double quotes + // may be in different buffers. Initialize to true because we also skip + // the opening quotation mark of the quoted field. 
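+        // For example, the raw quoted field "say ""hi""" (including its
+        // framing quotes) is emitted as: say "hi"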
+ bool skip_next_quote = true; + for (const Piece& p : earlier_pieces) { + AppendUnescapedPiece(StringPiece(&p.buffer[p.start], p.len), + &field_complete, &skip_next_quote); + } + AppendUnescapedPiece(field, &field_complete, &skip_next_quote); + StringPiece result = StringPiece(field_complete); + result.remove_suffix(1); // Skip final quote + + return FieldToOutput(ctx, result, out_tensors); + } + + void AppendUnescapedPiece(StringPiece piece, string* field_complete, + bool* skip_next_quote) { + size_t from = 0; + size_t found = piece.find('\"', from); + while (found != string::npos) { + if (!*skip_next_quote) { + // This is the first quote in a pair of adjacent double quotes + field_complete->append(piece.data() + from, found + 1 - from); + } + *skip_next_quote = !*skip_next_quote; + from = found + 1; + found = piece.find('\"', from); + } + // Include the chunk after the last quotation mark in the string + if (from < piece.size()) { + field_complete->append(piece.data() + from, piece.size() - from); + } + } + + // Parses unquoted field from position pos_ in the buffer. Continually + // reads from buffer until end of field is reached (delim, CRLF, or EOF). + // Advances pos_ to keep track of our position in the buffer as we go, + // stopping at the first character of the next field. + Status ParseUnquotedField(IteratorContext* ctx, + std::vector* out_tensors, + bool* end_of_record, bool include) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + std::vector earlier_pieces; + size_t start = pos_; + while (true) { // Each iter reads 1 char, filling buffer if necessary + if (pos_ >= buffer_.size()) { + Status s = SaveAndFillBuffer(&earlier_pieces, &start, include); + // Handle errors + if (errors::IsOutOfRange(s)) { + // Whatever we have is the last field of the last record + *end_of_record = true; + return UnquotedFieldToOutput( + ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors, + earlier_pieces, include); + } else if (!s.ok()) { + return s; // Surface all other errors to caller + } + } + + char ch = buffer_[pos_]; + + if (ch == dataset()->delim_) { + Status s = UnquotedFieldToOutput( + ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors, + earlier_pieces, include); + pos_++; + return s; + } + if (ch == '\n' || ch == '\r') { + // need special case to skip over first \n of record if the line + // breaks are \r\n + Status s = UnquotedFieldToOutput( + ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors, + earlier_pieces, include); + *end_of_record = true; + pos_++; + if (ch == '\r') SkipNewLineIfNecessary(); + return s; + } + if (dataset()->use_quote_delim_ && ch == '"') { + // Advance pos_ to the next field anyway so that we can ignore + // errors gracefully if required. The caller of this will be able to + // call ParseOneField and continue with the rest of the record. + AdvanceToNextField(end_of_record); + return errors::InvalidArgument( + "Unquoted fields cannot have quotes inside"); + } + // Otherwise, go to next character + pos_++; + } + } + + // Advances pos_ to the start of the next field, as delimited by delim, + // CRLF, or EOF, ignoring errors, and not keeping track of characters in + // the current field. 
+ void AdvanceToNextField(bool* end_of_record) + EXCLUSIVE_LOCKS_REQUIRED(mu_) { + while (true) { + if (pos_ >= buffer_.size()) { + Status s = FillBuffer(&buffer_); + pos_ = 0; + if (!s.ok()) { + *end_of_record = true; + return; + } + } + + char ch = buffer_[pos_]; + pos_++; + + if (ch == dataset()->delim_) { + return; + } + + if (ch == '\n' || ch == '\r') { + *end_of_record = true; + if (ch == '\r') SkipNewLineIfNecessary(); + return; + } + } + } + + Status FillBuffer(string* result) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + result->clear(); + Status s = input_stream_->ReadNBytes(dataset()->buffer_size_, result); + + if (errors::IsOutOfRange(s) && !result->empty()) { + // Ignore OutOfRange error when ReadNBytes read < N bytes. + return Status::OK(); + } + return s; + } + + // Given a field, converts it to the right output tensor type + Status FieldToOutput(IteratorContext* ctx, StringPiece field, + std::vector* out_tensors) { + size_t output_idx = out_tensors->size(); + if (output_idx >= dataset()->out_type_.size()) { + // We can get here if we're selecting all columns, but the number of + // fields exceeds the number of defaults provided + return errors::InvalidArgument("Expect ", dataset()->out_type_.size(), + " fields but have more in record"); + } + const DataType& dtype = dataset()->out_type_[output_idx]; + Tensor component(ctx->allocator({}), dtype, {}); + if ((field.empty() || field == dataset()->na_value_) && + dataset()->record_defaults_[output_idx].NumElements() != 1) { + // If the field is empty or NA value, and default is not given, + // report error. + return errors::InvalidArgument("Field ", output_idx, + " is required but missing in record!"); + } + + switch (dtype) { + // For each case, if the field is empty, we use the default. + // Otherwise, we convert it to the right type. 
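+          // For example, with na_value == "NA" and a record default of 7 for
+          // an int32 column, both an empty field and the literal field "NA"
+          // yield the value 7.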
+ case DT_INT32: { + if (field.empty() || field == dataset()->na_value_) { + component.scalar()() = + dataset()->record_defaults_[output_idx].flat()(0); + } else { + int32 value; + if (!strings::safe_strto32(field, &value)) { + return errors::InvalidArgument( + "Field ", output_idx, + " in record is not a valid int32: ", field); + } + component.scalar()() = value; + } + break; + } + case DT_INT64: { + if (field.empty() || field == dataset()->na_value_) { + component.scalar()() = + dataset()->record_defaults_[output_idx].flat()(0); + } else { + int64 value; + if (!strings::safe_strto64(field, &value)) { + return errors::InvalidArgument( + "Field ", output_idx, + " in record is not a valid int64: ", field); + } + component.scalar()() = value; + } + break; + } + case DT_FLOAT: { + if (field.empty() || field == dataset()->na_value_) { + component.scalar()() = + dataset()->record_defaults_[output_idx].flat()(0); + } else { + float value; + if (!strings::safe_strtof(field, &value)) { + return errors::InvalidArgument( + "Field ", output_idx, + " in record is not a valid float: ", field); + } + component.scalar()() = value; + } + break; + } + case DT_DOUBLE: { + if (field.empty() || field == dataset()->na_value_) { + component.scalar()() = + dataset()->record_defaults_[output_idx].flat()(0); + } else { + double value; + if (!strings::safe_strtod(field, &value)) { + return errors::InvalidArgument( + "Field ", output_idx, + " in record is not a valid double: ", field); + } + component.scalar()() = value; + } + break; + } + case DT_STRING: { + if (field.empty() || field == dataset()->na_value_) { + component.scalar()() = + dataset()->record_defaults_[output_idx].flat()(0); + } else { + component.scalar()() = field.ToString(); + } + break; + } + default: + return errors::InvalidArgument("csv: data type ", dtype, + " not supported in field ", + output_idx); + } + out_tensors->push_back(std::move(component)); + return Status::OK(); + } + + // Records can be delimited by "\r\n" line breaks. When we encounter a + // '\r', we have to check the next character to see if it is part of the + // linebreak, and ignore it if so. + void SkipNewLineIfNecessary() EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (pos_ >= buffer_.size()) { + Status s = FillBuffer(&buffer_); + pos_ = 0; + // If we failed to fill buffer, it doesn't matter because we're done + // with the record + if (!s.ok()) return; + } + if (buffer_[pos_] == '\n') { + pos_++; + } + } + + // Given a string field, and its index in the output, + // converts it to a Tensor of the right type and adds it to the + // out_tensors vector. + Status UnquotedFieldToOutput(IteratorContext* ctx, StringPiece field, + std::vector* out_tensors, + const std::vector& earlier_pieces, + bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (!include) return Status::OK(); + + if (earlier_pieces.empty()) { + return FieldToOutput(ctx, field, out_tensors); + } + + size_t str_len = field.size(); + for (const Piece& p : earlier_pieces) { + str_len += p.len; + } + string field_complete; + field_complete.reserve(str_len); + + for (const Piece& p : earlier_pieces) { + field_complete.append(p.buffer, p.start, p.len); + } + + field_complete.append(field.data(), field.size()); + return FieldToOutput(ctx, field_complete, out_tensors); + } + + // Sets up reader streams to read from the file at `current_file_index_`. 
+ Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) { + if (current_file_index_ >= dataset()->filenames_.size()) { + return errors::InvalidArgument( + "current_file_index_:", current_file_index_, + " >= filenames_.size():", dataset()->filenames_.size()); + } + + // Actually move on to next file. + TF_RETURN_IF_ERROR(env->NewRandomAccessFile( + dataset()->filenames_[current_file_index_], &file_)); + input_stream_.reset( + new io::RandomAccessInputStream(file_.get(), false)); + buffer_.clear(); + pos_ = 0; + if (dataset()->header_) { + // Read one line, but don't include it. Pass nullptrs as dummy + // pointers to objects that shouldn't be invoked anyway + // We need to process this as a record here instead of just finding + // the first newline because it might contain quoted fields with + // newlines in the header as well + std::vector empty; + Status s = ReadRecord(nullptr, nullptr, false, empty); + if (!s.ok()) { + return errors::InvalidArgument("Can't read header of file"); + } + } + return Status::OK(); + } + + // Resets all reader streams. + void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { + input_stream_.reset(); + file_.reset(); + } + + mutex mu_; + string buffer_ GUARDED_BY(mu_); // Maintain our own buffer + size_t pos_ GUARDED_BY( + mu_); // Index into the buffer must be maintained between iters + std::unique_ptr input_stream_ + GUARDED_BY(mu_); + size_t current_file_index_ GUARDED_BY(mu_) = 0; + std::unique_ptr file_ + GUARDED_BY(mu_); // must outlive input_stream_ + }; // class Iterator + + const std::vector filenames_; + const bool header_; + const int64 buffer_size_; + const DataTypeVector out_type_; + const std::vector output_shapes_; + const std::vector record_defaults_; + const std::vector select_cols_; + const bool use_quote_delim_; + const char delim_; + const string na_value_; + }; // class Dataset + + DataTypeVector output_types_; + std::vector output_shapes_; +}; // class CSVDatasetOp + +// Register the kernel implementation for CSVDataset. 
+REGISTER_KERNEL_BUILDER(Name("CSVDataset").Device(DEVICE_CPU), CSVDatasetOp); + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc index 48d3734162525f..6a12ca06f4d6cc 100644 --- a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc @@ -91,7 +91,7 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { } } - std::unique_ptr MakeIterator( + std::unique_ptr MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr(new Iterator( {this, strings::StrCat(prefix, "::DirectedInterleave")})); @@ -105,7 +105,7 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { return output_shapes_; } - string DebugString() override { + string DebugString() const override { return strings::StrCat("DirectedInterleaveDatasetOp::Dataset"); } @@ -130,15 +130,21 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel { public: explicit Iterator(const Params& params) : DatasetIterator(params), - selector_input_impl_(params.dataset->selector_input_->MakeIterator( - params.prefix + ".selector")), - num_active_inputs_(params.dataset->data_inputs_.size()) { - data_input_impls_.reserve(params.dataset->data_inputs_.size()); - for (size_t i = 0; i < params.dataset->data_inputs_.size(); ++i) { - const DatasetBase* data_input = params.dataset->data_inputs_[i]; - data_input_impls_.push_back(data_input->MakeIterator( - strings::StrCat(params.prefix, "[", i, "]"))); + num_active_inputs_(params.dataset->data_inputs_.size()) {} + + Status Initialize(IteratorContext* ctx) override { + mutex_lock l(mu_); + TF_RETURN_IF_ERROR(dataset()->selector_input_->MakeIterator( + ctx, strings::StrCat(prefix(), ".selector"), + &selector_input_impl_)); + data_input_impls_.resize(dataset()->data_inputs_.size()); + for (size_t i = 0; i < data_input_impls_.size(); ++i) { + const DatasetBase* data_input = dataset()->data_inputs_[i]; + TF_RETURN_IF_ERROR(data_input->MakeIterator( + ctx, strings::StrCat(prefix(), "[", i, "]"), + &data_input_impls_[i])); } + return Status::OK(); } Status GetNextInternal(IteratorContext* ctx, diff --git a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc index bb29df60e8f114..bbec50681c6f5d 100644 --- a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc @@ -44,7 +44,7 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel { ~Dataset() override { input_->Unref(); } - std::unique_ptr MakeIterator( + std::unique_ptr MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr( new Iterator({this, strings::StrCat(prefix, "::IgnoreErrors")})); @@ -57,7 +57,9 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel { return input_->output_shapes(); } - string DebugString() override { return "IgnoreErrorsDatasetOp::Dataset"; } + string DebugString() const override { + return "IgnoreErrorsDatasetOp::Dataset"; + } protected: Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, @@ -72,8 +74,11 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel { class Iterator : public DatasetIterator { public: explicit Iterator(const Params& params) - : DatasetIterator(params), - input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {} + : 
DatasetIterator(params) {} + + Status Initialize(IteratorContext* ctx) override { + return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); + } Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, diff --git a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc index 63e19ae3f837c9..3dfc3741c2b040 100644 --- a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc @@ -127,7 +127,7 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel { threadpool_->Unref(); } - std::unique_ptr MakeIterator( + std::unique_ptr MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr( new Iterator({this, strings::StrCat(prefix, "::ThreadPool")})); @@ -140,7 +140,9 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel { return input_->output_shapes(); } - string DebugString() override { return "ThreadPoolDatasetOp::Dataset"; } + string DebugString() const override { + return "ThreadPoolDatasetOp::Dataset"; + } protected: Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b, @@ -154,8 +156,11 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel { class Iterator : public DatasetIterator { public: explicit Iterator(const Params& params) - : DatasetIterator(params), - input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {} + : DatasetIterator(params) {} + + Status Initialize(IteratorContext* ctx) override { + return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); + } Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, diff --git a/tensorflow/contrib/data/kernels/unique_dataset_op.cc b/tensorflow/contrib/data/kernels/unique_dataset_op.cc index 69fbb0fcdcce87..67c237799c10a2 100644 --- a/tensorflow/contrib/data/kernels/unique_dataset_op.cc +++ b/tensorflow/contrib/data/kernels/unique_dataset_op.cc @@ -56,7 +56,7 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel { ~Dataset() override { input_->Unref(); } - std::unique_ptr MakeIterator( + std::unique_ptr MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr( new Iterator({this, strings::StrCat(prefix, "::Unique")})); @@ -70,7 +70,7 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel { return input_->output_shapes(); } - string DebugString() override { + string DebugString() const override { return strings::StrCat("UniqueDatasetOp::Dataset"); } @@ -87,8 +87,11 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel { class Iterator : public DatasetIterator { public: explicit Iterator(const typename Iterator::Params& params) - : DatasetIterator(params), - input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {} + : DatasetIterator(params) {} + + Status Initialize(IteratorContext* ctx) override { + return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_); + } Status GetNextInternal(IteratorContext* ctx, std::vector* out_tensors, diff --git a/tensorflow/contrib/data/ops/dataset_ops.cc b/tensorflow/contrib/data/ops/dataset_ops.cc index 137deb63527f0b..f271d269ab1b93 100644 --- a/tensorflow/contrib/data/ops/dataset_ops.cc +++ b/tensorflow/contrib/data/ops/dataset_ops.cc @@ -34,6 +34,40 @@ data_input_datasets: `N` datasets with the same type that will be interleaved according to the values of `selector_input_dataset`. 
)doc"); +REGISTER_OP("CSVDataset") + .Input("filenames: string") + .Input("buffer_size: int64") + .Input("header: bool") + .Input("field_delim: string") + .Input("use_quote_delim: bool") + .Input("na_value: string") + .Input("select_cols: int64") + .Input("record_defaults: output_types") + .Output("handle: variant") + .Attr("output_types: list({float,double,int32,int64,string}) >= 1") + .Attr("output_shapes: list(shape) >= 1") + .SetIsStateful() // TODO(b/65524810): Source dataset ops must be marked + // stateful to inhibit constant folding. + .SetShapeFn([](shape_inference::InferenceContext* c) { + shape_inference::ShapeHandle unused; + // `filenames` must be a scalar or a vector. + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused)); + // `buffer_size`, `header`, `field_delim`, `use_quote_delim`, + // `na_value` must be scalars + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused)); + // `select_cols` must be a vector + TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 1, &unused)); + // `record_defaults` must be a list of scalars...? + for (size_t i = 7; i < c->num_inputs(); ++i) { + TF_RETURN_IF_ERROR(c->WithRank(c->input(i), 1, &unused)); + } + return shape_inference::ScalarShape(c); + }); + REGISTER_OP("IgnoreErrorsDataset") .Input("input_dataset: variant") .Output("handle: variant") diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD index 23a2d7351361ca..995e283a32400d 100644 --- a/tensorflow/contrib/data/python/kernel_tests/BUILD +++ b/tensorflow/contrib/data/python/kernel_tests/BUILD @@ -11,7 +11,10 @@ py_test( size = "medium", srcs = ["batch_dataset_op_test.py"], srcs_version = "PY2AND3", - tags = ["no_pip"], + tags = [ + "no_oss", # (b/79552534) + "no_pip", + ], deps = [ ":dataset_serialization_test", "//tensorflow/contrib/data/python/ops:batching", @@ -32,7 +35,7 @@ py_test( py_test( name = "bucketing_test", - size = "small", + size = "medium", srcs = ["bucketing_test.py"], srcs_version = "PY2AND3", deps = [ @@ -117,6 +120,20 @@ py_library( ], ) +py_test( + name = "csv_dataset_op_test", + size = "small", + srcs = ["csv_dataset_op_test.py"], + srcs_version = "PY2AND3", + tags = ["no_pip"], + deps = [ + ":dataset_serialization_test", + "//tensorflow/contrib/data/python/ops:error_ops", + "//tensorflow/contrib/data/python/ops:readers", + "//third_party/py/numpy", + ], +) + py_test( name = "filter_dataset_op_test", size = "small", @@ -192,6 +209,23 @@ py_test( ], ) +py_test( + name = "directed_interleave_dataset_test", + size = "medium", + srcs = ["directed_interleave_dataset_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":dataset_serialization_test", + "//tensorflow/contrib/data/python/ops:interleave_ops", + "//tensorflow/python:client", + "//tensorflow/python:client_testlib", + "//tensorflow/python:errors", + "//tensorflow/python:training", + "//tensorflow/python/data/ops:dataset_ops", + "//third_party/py/numpy", + ], +) + tf_py_test( name = "get_single_element_test", size = "small", @@ -246,6 +280,19 @@ py_test( ], ) +py_test( + name = "optimize_dataset_op_test", + size = "small", + srcs = ["optimize_dataset_op_test.py"], + srcs_version = "PY2AND3", + deps = [ + ":dataset_serialization_test", + "//tensorflow/contrib/data/python/ops:optimization", + 
"//tensorflow/python:platform", + "//tensorflow/python/data/ops:dataset_ops", + ], +) + py_test( name = "prefetch_dataset_op_test", size = "small", @@ -287,6 +334,7 @@ py_test( name = "reader_dataset_ops_test", size = "medium", srcs = ["reader_dataset_ops_test.py"], + shard_count = 4, srcs_version = "PY2AND3", tags = ["no_pip"], deps = [ @@ -301,6 +349,7 @@ py_test( "//tensorflow/python:framework_ops", "//tensorflow/python:lib", "//tensorflow/python:parsing_ops", + "//tensorflow/python:string_ops", "//tensorflow/python:util", "//tensorflow/python/data/ops:iterator_ops", "//third_party/py/numpy", @@ -320,11 +369,15 @@ py_test( deps = [ "//tensorflow/contrib/data/python/ops:resampling", "//tensorflow/python:client_testlib", + "//tensorflow/python:dtypes", "//tensorflow/python:errors", + "//tensorflow/python:math_ops", + "//tensorflow/python:random_ops", "//tensorflow/python:string_ops", "//tensorflow/python:util", "//tensorflow/python/data/ops:dataset_ops", "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) @@ -407,6 +460,7 @@ py_test( srcs = ["sql_dataset_op_test.py"], srcs_version = "PY2AND3", deps = [ + ":dataset_serialization_test", "//tensorflow/contrib/data/python/ops:readers", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py index a4a0ce79b6013d..b5fbc45ad3d8d2 100644 --- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py @@ -427,7 +427,9 @@ def testBatchAndDropRemainderShapeInference(self): self.assertEqual([None], dataset.output_shapes[1][0].as_list()) self.assertEqual([None, 30], dataset.output_shapes[1][1].as_list()) - def _testMapAndBatchDatasetHelper(self, num_parallel_batches=1): + def _testMapAndBatchDatasetHelper(self, + num_parallel_calls=None, + num_parallel_batches=None): """Test a dataset that maps a TF function across its input elements.""" # The pipeline is TensorSliceDataset -> # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size). 
@@ -446,6 +448,7 @@ def _map_fn(x, y, z): batching.map_and_batch( map_func=_map_fn, batch_size=batch_size, + num_parallel_calls=num_parallel_calls, num_parallel_batches=num_parallel_batches)) .make_initializable_iterator()) init_op = iterator.initializer @@ -497,12 +500,18 @@ def _map_fn(x, y, z): with self.assertRaises(errors.InvalidArgumentError): sess.run(init_op, feed_dict={count: 14, batch_size: 0}) - def testMapAndBatchDataset(self): + def testMapAndBatch(self): return self._testMapAndBatchDatasetHelper() - def testMapAndBatchDatasetWithParallelBatching(self): + def testMapAndBatchWithParallelBatches(self): return self._testMapAndBatchDatasetHelper(num_parallel_batches=10) + def testMapAndBatchWithSequentialCalls(self): + return self._testMapAndBatchDatasetHelper(num_parallel_calls=1) + + def testMapAndBatchWithParallelCalls(self): + return self._testMapAndBatchDatasetHelper(num_parallel_calls=2) + def _testMapAndBatchPartialBatchHelper(self, drop_remainder=False): iterator = ( dataset_ops.Dataset.range(10).apply( @@ -543,6 +552,44 @@ def testMapAndBatchYieldsPartialBatch(self): with self.assertRaises(errors.OutOfRangeError): sess.run(next_element) + def testMapAndBatchParallelGetNext(self): + iterator = (dataset_ops.Dataset.range(50000) + .apply(batching.map_and_batch(lambda x: x, batch_size=100)) + .make_one_shot_iterator()) + elements = [] + for _ in range(100): + elements.append(iterator.get_next()) + with self.test_session() as sess: + for i in range(5): + got = sess.run(elements) + got.sort(key=lambda x: x[0]) + expected = [] + for j in range(100): + expected.append(range(i*10000+j*100, i*10000+(j+1)*100)) + self.assertAllEqual(got, expected) + with self.assertRaises(errors.OutOfRangeError): + sess.run(elements) + + def testMapAndBatchParallelGetNextDropRemainder(self): + iterator = ( + dataset_ops.Dataset.range(49999).apply( + batching.map_and_batch( + lambda x: x, batch_size=100, drop_remainder=True)) + .make_one_shot_iterator()) + elements = [] + for _ in range(100): + elements.append(iterator.get_next()) + with self.test_session() as sess: + for i in range(4): + got = sess.run(elements) + got.sort(key=lambda x: x[0]) + expected = [] + for j in range(100): + expected.append(range(i*10000+j*100, i*10000+(j+1)*100)) + self.assertAllEqual(got, expected) + with self.assertRaises(errors.OutOfRangeError): + sess.run(elements) + def testMapAndBatchSparse(self): def _sparse(i): @@ -630,9 +677,7 @@ def _build_dataset_dense_to_sparse(self, components): lambda x: array_ops.fill([x], x)).apply( batching.dense_to_sparse_batch(4, [12])) - # TODO(b/70988345): Re-enable when sparse tensors are properly supported by - # the DatasetSerializationTestBase. 
- def _testDenseToSparseBatchDatasetCore(self): + def testDenseToSparseBatchDatasetCore(self): components = np.random.randint(5, size=(40,)).astype(np.int32) diff_comp = np.random.randint(2, size=(100,)).astype(np.int32) @@ -684,7 +729,7 @@ def testCore(self): class MapAndBatchDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): - def testSerializationCore(self): + def testNumParallelBatches(self): range_size = 11 num_repeats = 2 batch_size = 5 @@ -711,6 +756,33 @@ def _map_fn(x): self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True), num_outputs_drop_remainder) + def testNumParallelCalls(self): + range_size = 11 + num_repeats = 2 + batch_size = 5 + total_outputs = range_size * num_repeats + num_outputs_drop_remainder = total_outputs // batch_size + num_outputs_keep_remainder = int(math.ceil(total_outputs / batch_size)) + num_parallel_calls = 7 + + def build_ds(range_start, drop_remainder=False): + + def _map_fn(x): + return math_ops.square(x) + + return dataset_ops.Dataset.range( + range_start, range_start + range_size).repeat(num_repeats).apply( + batching.map_and_batch( + map_func=_map_fn, + batch_size=batch_size, + num_parallel_calls=num_parallel_calls, + drop_remainder=drop_remainder)) + + self.run_core_tests(lambda: build_ds(10), lambda: build_ds(15), + num_outputs_keep_remainder) + self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True), + num_outputs_drop_remainder) + class PaddedBatchDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): diff --git a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py index 55a56b83a8efba..bd3e034211c4aa 100644 --- a/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/bucketing_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -35,6 +36,179 @@ from tensorflow.python.platform import test +class GroupByReducerTest(test.TestCase): + + def checkResults(self, dataset, shapes, values): + self.assertEqual(shapes, dataset.output_shapes) + get_next = dataset.make_one_shot_iterator().get_next() + with self.test_session() as sess: + for expected in values: + got = sess.run(get_next) + self.assertEqual(got, expected) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testSum(self): + reducer = grouping.Reducer( + init_func=lambda _: np.int64(0), + reduce_func=lambda x, y: x + y, + finalize_func=lambda x: x) + for i in range(1, 11): + dataset = dataset_ops.Dataset.range(2 * i).apply( + grouping.group_by_reducer(lambda x: x % 2, reducer)) + self.checkResults( + dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i]) + + def testAverage(self): + + def reduce_fn(x, y): + return (x[0] * x[1] + math_ops.cast(y, dtypes.float32)) / ( + x[1] + 1), x[1] + 1 + + reducer = grouping.Reducer( + init_func=lambda _: (0.0, 0.0), + reduce_func=reduce_fn, + finalize_func=lambda x: x[0]) + for i in range(1, 11): + dataset = dataset_ops.Dataset.range(2 * i).apply( + grouping.group_by_reducer( + lambda x: math_ops.cast(x, dtypes.int64) % 2, reducer)) + self.checkResults( + dataset, 
shapes=tensor_shape.scalar(), values=[i - 1, i]) + + def testConcat(self): + components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray) + reducer = grouping.Reducer( + init_func=lambda x: "", + reduce_func=lambda x, y: x + y[0], + finalize_func=lambda x: x) + for i in range(1, 11): + dataset = dataset_ops.Dataset.zip( + (dataset_ops.Dataset.from_tensor_slices(components), + dataset_ops.Dataset.range(2 * i))).apply( + grouping.group_by_reducer(lambda x, y: y % 2, reducer)) + self.checkResults( + dataset, + shapes=tensor_shape.scalar(), + values=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]]) + + def testSparseSum(self): + def _sparse(i): + return sparse_tensor.SparseTensorValue( + indices=np.array([[0, 0]]), + values=(i * np.array([1], dtype=np.int64)), + dense_shape=np.array([1, 1])) + + reducer = grouping.Reducer( + init_func=lambda _: _sparse(np.int64(0)), + reduce_func=lambda x, y: _sparse(x.values[0] + y.values[0]), + finalize_func=lambda x: x.values[0]) + for i in range(1, 11): + dataset = dataset_ops.Dataset.range(2 * i).map(_sparse).apply( + grouping.group_by_reducer(lambda x: x.values[0] % 2, reducer)) + self.checkResults( + dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i]) + + def testChangingStateShape(self): + + def reduce_fn(x, _): + # Statically known rank, but dynamic length. + larger_dim = array_ops.concat([x[0], x[0]], 0) + # Statically unknown rank. + larger_rank = array_ops.expand_dims(x[1], 0) + return larger_dim, larger_rank + + reducer = grouping.Reducer( + init_func=lambda x: ([0], 1), + reduce_func=reduce_fn, + finalize_func=lambda x: x) + + for i in range(1, 11): + dataset = dataset_ops.Dataset.from_tensors(np.int64(0)).repeat(i).apply( + grouping.group_by_reducer(lambda x: x, reducer)) + self.assertEqual([None], dataset.output_shapes[0].as_list()) + self.assertIs(None, dataset.output_shapes[1].ndims) + iterator = dataset.make_one_shot_iterator() + get_next = iterator.get_next() + with self.test_session() as sess: + x, y = sess.run(get_next) + self.assertAllEqual([0] * (2**i), x) + self.assertAllEqual(np.array(1, ndmin=i), y) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testTypeMismatch(self): + reducer = grouping.Reducer( + init_func=lambda x: constant_op.constant(1, dtype=dtypes.int32), + reduce_func=lambda x, y: constant_op.constant(1, dtype=dtypes.int64), + finalize_func=lambda x: x) + + dataset = dataset_ops.Dataset.range(10) + with self.assertRaisesRegexp( + TypeError, + "The element types for the new state must match the initial state."): + dataset.apply( + grouping.group_by_reducer(lambda _: np.int64(0), reducer)) + + # TODO(b/78665031): Remove once non-scalar keys are supported. + def testInvalidKeyShape(self): + reducer = grouping.Reducer( + init_func=lambda x: np.int64(0), + reduce_func=lambda x, y: x + y, + finalize_func=lambda x: x) + + dataset = dataset_ops.Dataset.range(10) + with self.assertRaisesRegexp( + ValueError, "`key_func` must return a single tf.int64 tensor."): + dataset.apply( + grouping.group_by_reducer(lambda _: np.int64((0, 0)), reducer)) + + # TODO(b/78665031): Remove once non-int64 keys are supported. 
+ def testInvalidKeyType(self): + reducer = grouping.Reducer( + init_func=lambda x: np.int64(0), + reduce_func=lambda x, y: x + y, + finalize_func=lambda x: x) + + dataset = dataset_ops.Dataset.range(10) + with self.assertRaisesRegexp( + ValueError, "`key_func` must return a single tf.int64 tensor."): + dataset.apply( + grouping.group_by_reducer(lambda _: "wrong", reducer)) + + +class GroupByReducerSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def _build_dataset(self, components): + reducer = grouping.Reducer( + init_func=lambda _: np.int64(0), + reduce_func=lambda x, y: x + y, + finalize_func=lambda x: x) + + return dataset_ops.Dataset.from_tensor_slices(components).apply( + grouping.group_by_reducer(lambda x: x % 5, reducer)) + + def testCoreGroupByReducer(self): + components = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=np.int64) + self.verify_unused_iterator( + lambda: self._build_dataset(components), 5, verify_exhausted=True) + self.verify_init_before_restore( + lambda: self._build_dataset(components), 5, verify_exhausted=True) + self.verify_multiple_breaks( + lambda: self._build_dataset(components), 5, verify_exhausted=True) + self.verify_reset_restored_iterator( + lambda: self._build_dataset(components), 5, verify_exhausted=True) + self.verify_restore_in_empty_graph( + lambda: self._build_dataset(components), 5, verify_exhausted=True) + diff_components = np.array([5, 4, 3, 2, 1, 0], dtype=np.int64) + self.verify_restore_in_modified_graph( + lambda: self._build_dataset(components), + lambda: self._build_dataset(diff_components), + 5, + verify_exhausted=True) + + class GroupByWindowTest(test.TestCase): def testSimple(self): diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py new file mode 100644 index 00000000000000..74b90ec7d1617d --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py @@ -0,0 +1,600 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for CsvDatasetOp.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import string +import tempfile +import time + +import numpy as np + +from tensorflow.contrib.data.python.ops import error_ops +from tensorflow.contrib.data.python.ops import readers +from tensorflow.python.client import session +from tensorflow.python.data.ops import readers as core_readers +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors +from tensorflow.python.framework import ops +from tensorflow.python.ops import gen_parsing_ops +from tensorflow.python.platform import gfile +from tensorflow.python.platform import googletest +from tensorflow.python.platform import test + + +class CsvDatasetOpTest(test.TestCase): + + def _assert_datasets_equal(self, g, ds1, ds2): + assert ds1.output_shapes == ds2.output_shapes, ('output_shapes differ: %s, ' + '%s') % (ds1.output_shapes, + ds2.output_shapes) + assert ds1.output_types == ds2.output_types + assert ds1.output_classes == ds2.output_classes + next1 = ds1.make_one_shot_iterator().get_next() + next2 = ds2.make_one_shot_iterator().get_next() + with self.test_session(graph=g) as sess: + # Run through datasets and check that outputs match, or errors match. + while True: + try: + op1 = sess.run(next1) + except (errors.OutOfRangeError, ValueError) as e: + # If op1 throws an exception, check that op2 throws same exception. + with self.assertRaises(type(e)): + sess.run(next2) + break + op2 = sess.run(next2) + self.assertAllEqual(op1, op2) + + def setup_files(self, inputs, linebreak='\n'): + filenames = [] + for i, ip in enumerate(inputs): + fn = os.path.join(self.get_temp_dir(), 'temp_%d.csv' % i) + with open(fn, 'wb') as f: + f.write(linebreak.join(ip).encode('utf-8')) + filenames.append(fn) + return filenames + + def _make_test_datasets(self, inputs, **kwargs): + # Test by comparing its output to what we could get with map->decode_csv + filenames = self.setup_files(inputs) + dataset_expected = core_readers.TextLineDataset(filenames) + dataset_expected = dataset_expected.map( + lambda l: gen_parsing_ops.decode_csv(l, **kwargs)) + dataset_actual = readers.CsvDataset(filenames, **kwargs) + return (dataset_actual, dataset_expected) + + def _test_by_comparison(self, inputs, **kwargs): + """Checks that CsvDataset is equiv to TextLineDataset->map(decode_csv).""" + with ops.Graph().as_default() as g: + dataset_actual, dataset_expected = self._make_test_datasets( + inputs, **kwargs) + self._assert_datasets_equal(g, dataset_actual, dataset_expected) + + def _verify_output_or_err(self, + sess, + dataset, + expected_output=None, + expected_err_re=None): + nxt = dataset.make_one_shot_iterator().get_next() + if expected_err_re is None: + # Verify that output is expected, without errors + expected_output = [[ + v.encode('utf-8') if isinstance(v, str) else v for v in op + ] for op in expected_output] + for value in expected_output: + op = sess.run(nxt) + self.assertAllEqual(op, value) + with self.assertRaises(errors.OutOfRangeError): + sess.run(nxt) + else: + # Verify that OpError is produced as expected + with self.assertRaisesOpError(expected_err_re): + while True: + try: + sess.run(nxt) + except errors.OutOfRangeError: + break + + def _test_dataset(self, + inputs, + expected_output=None, + expected_err_re=None, + linebreak='\n', 
+ **kwargs): + """Checks that elements produced by CsvDataset match expected output.""" + # Convert str type because py3 tf strings are bytestrings + filenames = self.setup_files(inputs, linebreak) + with ops.Graph().as_default() as g: + with self.test_session(graph=g) as sess: + dataset = readers.CsvDataset(filenames, **kwargs) + self._verify_output_or_err(sess, dataset, expected_output, + expected_err_re) + + def testCsvDataset_requiredFields(self): + record_defaults = [[]] * 4 + inputs = [['1,2,3,4']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + def testCsvDataset_int(self): + record_defaults = [[0]] * 4 + inputs = [['1,2,3,4', '5,6,7,8']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + def testCsvDataset_float(self): + record_defaults = [[0.0]] * 4 + inputs = [['1.0,2.1,3.2,4.3', '5.4,6.5,7.6,8.7']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + def testCsvDataset_string(self): + record_defaults = [['']] * 4 + inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + def testCsvDataset_withEmptyFields(self): + record_defaults = [[0]] * 4 + inputs = [[',,,', '1,1,1,', ',2,2,2']] + self._test_dataset( + inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]], + record_defaults=record_defaults) + + def testCsvDataset_errWithUnquotedQuotes(self): + record_defaults = [['']] * 3 + inputs = [['1,2"3,4']] + self._test_dataset( + inputs, + expected_err_re='Unquoted fields cannot have quotes inside', + record_defaults=record_defaults) + + def testCsvDataset_ignoreErrWithUnquotedQuotes(self): + record_defaults = [['']] * 3 + inputs = [['1,2"3,4', 'a,b,c"d', 'e,f,g']] + filenames = self.setup_files(inputs) + with ops.Graph().as_default() as g: + with self.test_session(graph=g) as sess: + dataset = readers.CsvDataset(filenames, record_defaults=record_defaults) + dataset = dataset.apply(error_ops.ignore_errors()) + self._verify_output_or_err(sess, dataset, [['e', 'f', 'g']]) + + def testCsvDataset_withNoQuoteDelimAndUnquotedQuotes(self): + record_defaults = [['']] * 3 + inputs = [['1,2"3,4']] + self._test_by_comparison( + inputs, record_defaults=record_defaults, use_quote_delim=False) + + def testCsvDataset_mixedTypes(self): + record_defaults = [ + constant_op.constant([], dtype=dtypes.int32), + constant_op.constant([], dtype=dtypes.float32), + constant_op.constant([], dtype=dtypes.string), + constant_op.constant([], dtype=dtypes.float64) + ] + inputs = [['1,2.1,3.2,4.3', '5,6.5,7.6,8.7']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + def testCsvDataset_withUseQuoteDelimFalse(self): + record_defaults = [['']] * 4 + inputs = [['1,2,"3,4"', '"5,6",7,8']] + self._test_by_comparison( + inputs, record_defaults=record_defaults, use_quote_delim=False) + + def testCsvDataset_withFieldDelim(self): + record_defaults = [[0]] * 4 + inputs = [['1:2:3:4', '5:6:7:8']] + self._test_by_comparison( + inputs, record_defaults=record_defaults, field_delim=':') + + def testCsvDataset_withNaValue(self): + record_defaults = [[0]] * 4 + inputs = [['1,NA,3,4', 'NA,6,7,8']] + self._test_by_comparison( + inputs, record_defaults=record_defaults, na_value='NA') + + def testCsvDataset_withSelectCols(self): + record_defaults = [['']] * 2 + inputs = [['1,2,3,4', '"5","6","7","8"']] + self._test_by_comparison( + inputs, record_defaults=record_defaults, select_cols=[1, 2]) + + def testCsvDataset_withSelectColsTooHigh(self): + record_defaults = [[0]] * 2 + inputs = 
[['1,2,3,4', '5,6,7,8']] + self._test_dataset( + inputs, + expected_err_re='Expect 2 fields but have 1 in record', + record_defaults=record_defaults, + select_cols=[3, 4]) + + def testCsvDataset_withOneCol(self): + record_defaults = [['NA']] + inputs = [['0', '', '2']] + self._test_dataset( + inputs, [['0'], ['NA'], ['2']], record_defaults=record_defaults) + + def testCsvDataset_withMultipleFiles(self): + record_defaults = [[0]] * 4 + inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + def testCsvDataset_withLeadingAndTrailingSpaces(self): + record_defaults = [[0.0]] * 4 + inputs = [['0, 1, 2, 3']] + expected = [[0.0, 1.0, 2.0, 3.0]] + self._test_dataset(inputs, expected, record_defaults=record_defaults) + + def testCsvDataset_errorWithMissingDefault(self): + record_defaults = [[]] * 2 + inputs = [['0,']] + self._test_dataset( + inputs, + expected_err_re='Field 1 is required but missing in record!', + record_defaults=record_defaults) + + def testCsvDataset_errorWithFewerDefaultsThanFields(self): + record_defaults = [[0.0]] * 2 + inputs = [['0,1,2,3']] + self._test_dataset( + inputs, + expected_err_re='Expect 2 fields but have more in record', + record_defaults=record_defaults) + + def testCsvDataset_errorWithMoreDefaultsThanFields(self): + record_defaults = [[0.0]] * 5 + inputs = [['0,1,2,3']] + self._test_dataset( + inputs, + expected_err_re='Expect 5 fields but have 4 in record', + record_defaults=record_defaults) + + def testCsvDataset_withHeader(self): + record_defaults = [[0]] * 2 + inputs = [['col1,col2', '1,2']] + expected = [[1, 2]] + self._test_dataset( + inputs, + expected, + record_defaults=record_defaults, + header=True, + ) + + def testCsvDataset_withHeaderAndNoRecords(self): + record_defaults = [[0]] * 2 + inputs = [['col1,col2']] + expected = [] + self._test_dataset( + inputs, + expected, + record_defaults=record_defaults, + header=True, + ) + + def testCsvDataset_errorWithHeaderEmptyFile(self): + record_defaults = [[0]] * 2 + inputs = [[]] + expected_err_re = "Can't read header of file" + self._test_dataset( + inputs, + expected_err_re=expected_err_re, + record_defaults=record_defaults, + header=True, + ) + + def testCsvDataset_withEmptyFile(self): + record_defaults = [['']] * 2 + inputs = [['']] # Empty file + self._test_dataset( + inputs, expected_output=[], record_defaults=record_defaults) + + def testCsvDataset_errorWithEmptyRecord(self): + record_defaults = [['']] * 2 + inputs = [['', '1,2']] # First record is empty + self._test_dataset( + inputs, + expected_err_re='Expect 2 fields but have 1 in record', + record_defaults=record_defaults) + + def testCsvDataset_withChainedOps(self): + # Testing that one dataset can create multiple iterators fine. + # `repeat` creates multiple iterators from the same C++ Dataset. 
+ record_defaults = [[0]] * 4 + inputs = [['1,,3,4', '5,6,,8']] + ds_actual, ds_expected = self._make_test_datasets( + inputs, record_defaults=record_defaults) + with ops.Graph().as_default() as g: + self._assert_datasets_equal(g, + ds_actual.repeat(5).prefetch(1), + ds_expected.repeat(5).prefetch(1)) + + def testCsvDataset_withTypeDefaults(self): + # Testing using dtypes as record_defaults for required fields + record_defaults = [dtypes.float32, [0.0]] + inputs = [['1.0,2.0', '3.0,4.0']] + self._test_dataset( + inputs, + [[1.0, 2.0], [3.0, 4.0]], + record_defaults=record_defaults, + ) + + def testMakeCsvDataset_fieldOrder(self): + data = [[ + '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19', + '1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19' + ]] + file_path = self.setup_files(data) + + with ops.Graph().as_default() as g: + ds = readers.make_csv_dataset( + file_path, batch_size=1, shuffle=False, num_epochs=1) + next_batch = ds.make_one_shot_iterator().get_next() + + with self.test_session(graph=g) as sess: + result = list(sess.run(next_batch).values()) + + self.assertEqual(result, sorted(result)) + +## The following tests exercise parsing logic for quoted fields + + def testCsvDataset_withQuoted(self): + record_defaults = [['']] * 4 + inputs = [['"a","b","c :)","d"', '"e","f","g :(","h"']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + def testCsvDataset_withOneColAndQuotes(self): + record_defaults = [['']] + inputs = [['"0"', '"1"', '"2"']] + self._test_dataset( + inputs, [['0'], ['1'], ['2']], record_defaults=record_defaults) + + def testCsvDataset_withNewLine(self): + # In this case, we expect it to behave differently from + # TextLineDataset->map(decode_csv) since that flow has bugs + record_defaults = [['']] * 4 + inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']] + expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']] + self._test_dataset(inputs, expected, record_defaults=record_defaults) + + def testCsvDataset_withNewLineInUnselectedCol(self): + record_defaults = [['']] + inputs = [['1,"2\n3",4', '5,6,7']] + self._test_dataset( + inputs, + expected_output=[['1'], ['5']], + record_defaults=record_defaults, + select_cols=[0]) + + def testCsvDataset_withMultipleNewLines(self): + # In this case, we expect it to behave differently from + # TextLineDataset->map(decode_csv) since that flow has bugs + record_defaults = [['']] * 4 + inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']] + expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']] + self._test_dataset(inputs, expected, record_defaults=record_defaults) + + def testCsvDataset_errorWithTerminateMidRecord(self): + record_defaults = [['']] * 4 + inputs = [['a,b,c,"a']] + self._test_dataset( + inputs, + expected_err_re= + 'Reached end of file without closing quoted field in record', + record_defaults=record_defaults) + + def testCsvDataset_withEscapedQuotes(self): + record_defaults = [['']] * 4 + inputs = [['1.0,2.1,"she said: ""hello""",4.3', '5.4,6.5,goodbye,8.7']] + self._test_by_comparison(inputs, record_defaults=record_defaults) + + +## Testing that parsing works with all buffer sizes, quoted/unquoted fields, +## and different types of line breaks + + def testCsvDataset_withInvalidBufferSize(self): + record_defaults = [['']] * 4 + inputs = [['a,b,c,d']] + self._test_dataset( + inputs, + expected_err_re='buffer_size should be positive', + record_defaults=record_defaults, + buffer_size=0) + + def testCsvDataset_withBufferSize(self): + record_defaults = [['NA']] * 3 + inputs = 
[['abc,def,ghi', '0,1,2', ',,']] + expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']] + for i in range(20): + # Test a range of buffer sizes that should all work + self._test_dataset( + inputs, expected, record_defaults=record_defaults, buffer_size=i + 1) + + def testCsvDataset_withCR(self): + # Test that when the line separator is '\r', parsing works with all buffer + # sizes + record_defaults = [['NA']] * 3 + inputs = [['abc,def,ghi', '0,1,2', ',,']] + expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']] + for i in range(20): + # Test a range of buffer sizes that should all work + self._test_dataset( + inputs, + expected, + linebreak='\r', + record_defaults=record_defaults, + buffer_size=i + 1) + + def testCsvDataset_withCRLF(self): + # Test that when the line separator is '\r\n', parsing works with all buffer + # sizes + record_defaults = [['NA']] * 3 + inputs = [['abc,def,ghi', '0,1,2', ',,']] + expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']] + for i in range(20): + # Test a range of buffer sizes that should all work + self._test_dataset( + inputs, + expected, + linebreak='\r\n', + record_defaults=record_defaults, + buffer_size=i + 1) + + def testCsvDataset_withBufferSizeAndQuoted(self): + record_defaults = [['NA']] * 3 + inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] + expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'], + ['NA', 'NA', 'NA']] + for i in range(20): + # Test a range of buffer sizes that should all work + self._test_dataset( + inputs, + expected, + linebreak='\n', + record_defaults=record_defaults, + buffer_size=i + 1) + self._test_dataset( + inputs, expected, linebreak='\n', record_defaults=record_defaults) + + def testCsvDataset_withCRAndQuoted(self): + # Test that when the line separator is '\r', parsing works with all buffer + # sizes + record_defaults = [['NA']] * 3 + inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] + expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'], + ['NA', 'NA', 'NA']] + for i in range(20): + # Test a range of buffer sizes that should all work + self._test_dataset( + inputs, + expected, + linebreak='\r', + record_defaults=record_defaults, + buffer_size=i + 1) + self._test_dataset( + inputs, expected, linebreak='\r', record_defaults=record_defaults) + + def testCsvDataset_withCRLFAndQuoted(self): + # Test that when the line separator is '\r\n', parsing works with all buffer + # sizes + record_defaults = [['NA']] * 3 + inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']] + expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'], + ['NA', 'NA', 'NA']] + for i in range(20): + # Test a range of buffer sizes that should all work + self._test_dataset( + inputs, + expected, + linebreak='\r\n', + record_defaults=record_defaults, + buffer_size=i + 1) + self._test_dataset( + inputs, expected, linebreak='\r\n', record_defaults=record_defaults) + + +class CsvDatasetBenchmark(test.Benchmark): + """Benchmarks for the various ways of creating a dataset from CSV files. 
+ """ + FLOAT_VAL = '1.23456E12' + STR_VAL = string.ascii_letters * 10 + + def _setUp(self, str_val): + # Since this isn't test.TestCase, have to manually create a test dir + gfile.MakeDirs(googletest.GetTempDir()) + self._temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir()) + + self._num_cols = [4, 64, 256] + self._num_per_iter = 5000 + self._filenames = [] + for n in self._num_cols: + fn = os.path.join(self._temp_dir, 'file%d.csv' % n) + with open(fn, 'wb') as f: + # Just write 100 rows and use `repeat`... Assumes the cost + # of creating an iterator is not significant + row = ','.join([str_val for _ in range(n)]) + f.write('\n'.join([row for _ in range(100)])) + self._filenames.append(fn) + + def _tearDown(self): + gfile.DeleteRecursively(self._temp_dir) + + def _runBenchmark(self, dataset, num_cols, prefix): + dataset = dataset.skip(self._num_per_iter - 1) + deltas = [] + for _ in range(10): + next_element = dataset.make_one_shot_iterator().get_next() + with session.Session() as sess: + start = time.time() + # NOTE: This depends on the underlying implementation of skip, to have + # the net effect of calling `GetNext` num_per_iter times on the + # input dataset. We do it this way (instead of a python for loop, or + # batching N inputs in one iter) so that the overhead from session.run + # or batch doesn't dominate. If we eventually optimize skip, this has + # to change. + sess.run(next_element) + end = time.time() + deltas.append(end - start) + # Median wall time per CSV record read and decoded + median_wall_time = np.median(deltas) / self._num_per_iter + print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols, + median_wall_time)) + self.report_benchmark( + iters=self._num_per_iter, + wall_time=median_wall_time, + name='%s_with_cols_%d' % (prefix, num_cols)) + + def benchmarkMapWithFloats(self): + self._setUp(self.FLOAT_VAL) + for i in range(len(self._filenames)): + num_cols = self._num_cols[i] + kwargs = {'record_defaults': [[0.0]] * num_cols} + dataset = core_readers.TextLineDataset(self._filenames[i]).repeat() + dataset = dataset.map(lambda l: gen_parsing_ops.decode_csv(l, **kwargs)) # pylint: disable=cell-var-from-loop + self._runBenchmark(dataset, num_cols, 'csv_float_map_decode_csv') + self._tearDown() + + def benchmarkMapWithStrings(self): + self._setUp(self.STR_VAL) + for i in range(len(self._filenames)): + num_cols = self._num_cols[i] + kwargs = {'record_defaults': [['']] * num_cols} + dataset = core_readers.TextLineDataset(self._filenames[i]).repeat() + dataset = dataset.map(lambda l: gen_parsing_ops.decode_csv(l, **kwargs)) # pylint: disable=cell-var-from-loop + self._runBenchmark(dataset, num_cols, 'csv_strings_map_decode_csv') + self._tearDown() + + def benchmarkCsvDatasetWithFloats(self): + self._setUp(self.FLOAT_VAL) + for i in range(len(self._filenames)): + num_cols = self._num_cols[i] + kwargs = {'record_defaults': [[0.0]] * num_cols} + dataset = core_readers.TextLineDataset(self._filenames[i]).repeat() + dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat() # pylint: disable=cell-var-from-loop + self._runBenchmark(dataset, num_cols, 'csv_float_fused_dataset') + self._tearDown() + + def benchmarkCsvDatasetWithStrings(self): + self._setUp(self.STR_VAL) + for i in range(len(self._filenames)): + num_cols = self._num_cols[i] + kwargs = {'record_defaults': [['']] * num_cols} + dataset = core_readers.TextLineDataset(self._filenames[i]).repeat() + dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat() # pylint: 
disable=cell-var-from-loop + self._runBenchmark(dataset, num_cols, 'csv_strings_fused_dataset') + self._tearDown() + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py new file mode 100644 index 00000000000000..34b6a080c0aae7 --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/directed_interleave_dataset_test.py @@ -0,0 +1,167 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for the experimental input pipeline ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base +from tensorflow.contrib.data.python.ops import interleave_ops +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import errors +from tensorflow.python.framework import random_seed +from tensorflow.python.platform import test + + +class DirectedInterleaveDatasetTest(test.TestCase): + + def testBasic(self): + selector_dataset = dataset_ops.Dataset.range(10).repeat(100) + input_datasets = [ + dataset_ops.Dataset.from_tensors(i).repeat(100) for i in range(10) + ] + dataset = interleave_ops.DirectedInterleaveDataset(selector_dataset, + input_datasets) + iterator = dataset.make_initializable_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + sess.run(iterator.initializer) + for _ in range(100): + for i in range(10): + self.assertEqual(i, sess.run(next_element)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + def _normalize(self, vec): + return vec / vec.sum() + + def _chi2(self, expected, actual): + actual = np.asarray(actual) + expected = np.asarray(expected) + diff = actual - expected + chi2 = np.sum(diff * diff / expected, axis=0) + return chi2 + + def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples): + # Create a dataset that samples each integer in `[0, num_datasets)` + # with probability given by `weights[i]`. 
+ dataset = interleave_ops.sample_from_datasets([ + dataset_ops.Dataset.from_tensors(i).repeat(None) + for i in range(num_datasets) + ], weights) + dataset = dataset.take(num_samples) + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + freqs = np.zeros([num_datasets]) + for _ in range(num_samples): + freqs[sess.run(next_element)] += 1 + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + return freqs + + def testSampleFromDatasets(self): + random_seed.set_random_seed(1619) + num_samples = 5000 + rand_probs = self._normalize(np.random.random_sample((15,))) + + # Use chi-squared test to assert that the observed distribution matches the + # expected distribution. Based on the implementation in + # "tensorflow/python/kernel_tests/multinomial_op_test.py". + for probs in [[.85, .05, .1], rand_probs]: + probs = np.asarray(probs) + classes = len(probs) + freqs = self._testSampleFromDatasetsHelper(probs, classes, num_samples) + self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2) + + # Also check that `weights` as a dataset samples correctly. + probs_ds = dataset_ops.Dataset.from_tensors(probs).repeat() + freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples) + self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2) + + def testSelectFromDatasets(self): + words = [b"foo", b"bar", b"baz"] + datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words] + choice_array = np.random.randint(3, size=(15,), dtype=np.int64) + choice_dataset = dataset_ops.Dataset.from_tensor_slices(choice_array) + dataset = interleave_ops.choose_from_datasets(datasets, choice_dataset) + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + for i in choice_array: + self.assertEqual(words[i], sess.run(next_element)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + def testErrors(self): + with self.assertRaisesRegexp(ValueError, + r"vector of length `len\(datasets\)`"): + interleave_ops.sample_from_datasets( + [dataset_ops.Dataset.range(10), + dataset_ops.Dataset.range(20)], + weights=[0.25, 0.25, 0.25, 0.25]) + + with self.assertRaisesRegexp(TypeError, "`tf.float32` or `tf.float64`"): + interleave_ops.sample_from_datasets( + [dataset_ops.Dataset.range(10), + dataset_ops.Dataset.range(20)], + weights=[1, 1]) + + with self.assertRaisesRegexp(TypeError, "must have the same type"): + interleave_ops.sample_from_datasets([ + dataset_ops.Dataset.from_tensors(0), + dataset_ops.Dataset.from_tensors(0.0) + ]) + + with self.assertRaisesRegexp(TypeError, "tf.int64"): + interleave_ops.choose_from_datasets([ + dataset_ops.Dataset.from_tensors(0), + dataset_ops.Dataset.from_tensors(1) + ], choice_dataset=dataset_ops.Dataset.from_tensors(1.0)) + + with self.assertRaisesRegexp(TypeError, "scalar"): + interleave_ops.choose_from_datasets([ + dataset_ops.Dataset.from_tensors(0), + dataset_ops.Dataset.from_tensors(1) + ], choice_dataset=dataset_ops.Dataset.from_tensors([1.0])) + + +class SampleFromDatasetsSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def _build_dataset(self, probs, num_samples): + dataset = interleave_ops.sample_from_datasets( + [ + dataset_ops.Dataset.from_tensors(i).repeat(None) + for i in range(len(probs)) + ], + probs, + seed=1813) + return dataset.take(num_samples) + + def testSerializationCore(self): + self.run_core_tests( + lambda: 
self._build_dataset([0.5, 0.5], 100), + lambda: self._build_dataset([0.25, 0.25, 0.25, 0.25], 1000), 100) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py index 43aa4b1bd02791..bee561e3e23a2a 100644 --- a/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/interleave_dataset_op_test.py @@ -30,7 +30,6 @@ from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors -from tensorflow.python.framework import random_seed from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -907,114 +906,5 @@ def interleave_fn(x): sess.run(self.next_element) -class DirectedInterleaveDatasetTest(test.TestCase): - - def testBasic(self): - selector_dataset = dataset_ops.Dataset.range(10).repeat(100) - input_datasets = [ - dataset_ops.Dataset.from_tensors(i).repeat(100) for i in range(10) - ] - dataset = interleave_ops.DirectedInterleaveDataset(selector_dataset, - input_datasets) - iterator = dataset.make_initializable_iterator() - next_element = iterator.get_next() - - with self.test_session() as sess: - sess.run(iterator.initializer) - for _ in range(100): - for i in range(10): - self.assertEqual(i, sess.run(next_element)) - with self.assertRaises(errors.OutOfRangeError): - sess.run(next_element) - - def _normalize(self, vec): - return vec / vec.sum() - - def _chi2(self, expected, actual): - actual = np.asarray(actual) - expected = np.asarray(expected) - diff = actual - expected - chi2 = np.sum(diff * diff / expected, axis=0) - return chi2 - - def _testSampleFromDatasetsHelper(self, weights, num_datasets, num_samples): - # Create a dataset that samples each integer in `[0, num_datasets)` - # with probability given by `weights[i]`. - dataset = interleave_ops.sample_from_datasets([ - dataset_ops.Dataset.from_tensors(i).repeat(None) - for i in range(num_datasets) - ], weights) - dataset = dataset.take(num_samples) - iterator = dataset.make_one_shot_iterator() - next_element = iterator.get_next() - - with self.test_session() as sess: - freqs = np.zeros([num_datasets]) - for _ in range(num_samples): - freqs[sess.run(next_element)] += 1 - with self.assertRaises(errors.OutOfRangeError): - sess.run(next_element) - - return freqs - - def testSampleFromDatasets(self): - random_seed.set_random_seed(1619) - num_samples = 10000 - rand_probs = self._normalize(np.random.random_sample((15,))) - - # Use chi-squared test to assert that the observed distribution matches the - # expected distribution. Based on the implementation in - # "tensorflow/python/kernel_tests/multinomial_op_test.py". - for probs in [[.85, .05, .1], rand_probs]: - probs = np.asarray(probs) - classes = len(probs) - freqs = self._testSampleFromDatasetsHelper(probs, classes, num_samples) - self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3) - - # Also check that `weights` as a dataset samples correctly. 
- probs_ds = dataset_ops.Dataset.from_tensors(probs).repeat() - freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples) - self.assertLess(self._chi2(probs, freqs / num_samples), 1e-3) - - def testErrors(self): - with self.assertRaisesRegexp(ValueError, - r"vector of length `len\(datasets\)`"): - interleave_ops.sample_from_datasets( - [dataset_ops.Dataset.range(10), - dataset_ops.Dataset.range(20)], - weights=[0.25, 0.25, 0.25, 0.25]) - - with self.assertRaisesRegexp(TypeError, "`tf.float32` or `tf.float64`"): - interleave_ops.sample_from_datasets( - [dataset_ops.Dataset.range(10), - dataset_ops.Dataset.range(20)], - weights=[1, 1]) - - with self.assertRaisesRegexp(TypeError, "must have the same type"): - interleave_ops.sample_from_datasets([ - dataset_ops.Dataset.from_tensors(0), - dataset_ops.Dataset.from_tensors(0.0) - ]) - - -class SampleFromDatasetsSerializationTest( - dataset_serialization_test_base.DatasetSerializationTestBase): - - def _build_dataset(self, probs, num_samples): - dataset = interleave_ops.sample_from_datasets( - [ - dataset_ops.Dataset.from_tensors(i).repeat(None) - for i in range(len(probs)) - ], - probs, - seed=1813) - return dataset.take(num_samples) - - def testSerializationCore(self): - self.run_core_tests( - lambda: self._build_dataset([0.5, 0.5], 100), - lambda: self._build_dataset([0.25, 0.25, 0.25, 0.25], 1000), 100) - - if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py new file mode 100644 index 00000000000000..30f1847dcddbfa --- /dev/null +++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py @@ -0,0 +1,89 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Tests for the experimental input pipeline ops.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base +from tensorflow.contrib.data.python.ops import optimization +from tensorflow.core.framework import graph_pb2 +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.framework import errors +from tensorflow.python.platform import test + + +class OptimizeDatasetTest(test.TestCase): + + def testDefaultOptimizations(self): + dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x).batch( + 10).apply(optimization.optimize()) + iterator = dataset.make_one_shot_iterator() + get_next = iterator.get_next() + + with self.test_session() as sess: + graph = graph_pb2.GraphDef().FromString( + sess.run(dataset._as_serialized_graph())) + self.assertTrue( + all([node.op != "MapAndBatchDatasetV2" for node in graph.node])) + self.assertAllEqual([x * x for x in range(10)], sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testEmptyOptimizations(self): + dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x).batch( + 10).apply(optimization.optimize([])) + iterator = dataset.make_one_shot_iterator() + get_next = iterator.get_next() + + with self.test_session() as sess: + graph = graph_pb2.GraphDef().FromString( + sess.run(dataset._as_serialized_graph())) + self.assertTrue( + all([node.op != "MapAndBatchDatasetV2" for node in graph.node])) + self.assertAllEqual([x * x for x in range(10)], sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + def testOptimization(self): + dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x).batch( + 10).apply(optimization.optimize(["map_and_batch_fusion"])) + iterator = dataset.make_one_shot_iterator() + get_next = iterator.get_next() + + with self.test_session() as sess: + graph = graph_pb2.GraphDef().FromString( + sess.run(dataset._as_serialized_graph())) + self.assertTrue( + any([node.op == "MapAndBatchDatasetV2" for node in graph.node])) + self.assertAllEqual([x * x for x in range(10)], sess.run(get_next)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(get_next) + + +class OptimizeDatasetSerializationTest( + dataset_serialization_test_base.DatasetSerializationTestBase): + + def testCore(self): + + def build_dataset(num_elements, batch_size): + return dataset_ops.Dataset.range(num_elements).map(lambda x: x * x).batch( + batch_size).apply(optimization.optimize(["map_and_batch_fusion"])) + + self.run_core_tests(lambda: build_dataset(200, 10), None, 20) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py index 1075302bae96ca..e0237198b7d47e 100644 --- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py @@ -36,6 +36,7 @@ from tensorflow.python.lib.io import python_io from tensorflow.python.ops import array_ops from tensorflow.python.ops import parsing_ops +from tensorflow.python.ops import string_ops from tensorflow.python.platform import test from tensorflow.python.util import compat @@ -256,6 +257,29 @@ def testTFRecordWithCompressionCore(self): lambda: 
self._build_iterator_graph(num_epochs * 2), num_outputs) +def _interleave(iterators, cycle_length): + pending_iterators = iterators + open_iterators = [] + num_open = 0 + for i in range(cycle_length): + if pending_iterators: + open_iterators.append(pending_iterators.pop(0)) + num_open += 1 + + while num_open: + for i in range(min(cycle_length, len(open_iterators))): + if open_iterators[i] is None: + continue + try: + yield next(open_iterators[i]) + except StopIteration: + if pending_iterators: + open_iterators[i] = pending_iterators.pop(0) + else: + open_iterators[i] = None + num_open -= 1 + + class ReadBatchFeaturesTest(test.TestCase): def setUp(self): @@ -355,8 +379,8 @@ def _next_record(file_indices): yield j, i def _next_record_interleaved(file_indices, cycle_length): - return self._interleave([_next_record([i]) for i in file_indices], - cycle_length) + return _interleave([_next_record([i]) for i in file_indices], + cycle_length) file_batch = [] keywords_batch_indices = [] @@ -397,28 +421,6 @@ def _next_record_interleaved(file_indices, cycle_length): [len(file_batch), keywords_batch_max_len], record_batch ] - def _interleave(self, iterators, cycle_length): - pending_iterators = iterators - open_iterators = [] - num_open = 0 - for i in range(cycle_length): - if pending_iterators: - open_iterators.append(pending_iterators.pop(0)) - num_open += 1 - - while num_open: - for i in range(min(cycle_length, len(open_iterators))): - if open_iterators[i] is None: - continue - try: - yield next(open_iterators[i]) - except StopIteration: - if pending_iterators: - open_iterators[i] = pending_iterators.pop(0) - else: - open_iterators[i] = None - num_open -= 1 - def _verify_records(self, sess, batch_size, @@ -620,14 +622,12 @@ def _write_file(self, filename, rows): f.close() return fn - def _create_file(self, fileno, header=True, comment=True): + def _create_file(self, fileno, header=True): rows = [] if header: rows.append(self.COLUMNS) for recno in range(self._num_records): rows.append(self._csv_values(fileno, recno)) - if comment: - rows.append("# Some comment goes here. Ignore me.") return self._write_file("csv_file%d.csv" % fileno, rows) def _create_files(self): @@ -648,9 +648,7 @@ def _make_csv_dataset( shuffle=False, shuffle_seed=None, header=True, - comment="#", na_value="", - default_float_type=dtypes.float32, ): return readers.make_csv_dataset( filenames, @@ -662,9 +660,7 @@ def _make_csv_dataset( shuffle=shuffle, shuffle_seed=shuffle_seed, header=header, - comment=comment, na_value=na_value, - default_float_type=default_float_type, select_columns=select_cols, ) @@ -786,29 +782,6 @@ def testMakeCSVDataset_withNoLabel(self): num_epochs=10, label_name=None) - def testMakeCSVDataset_withNoComments(self): - """Tests that datasets can be created from CSV files with no header line. - """ - defaults = self.DEFAULTS - file_without_header = self._create_file( - len(self._test_filenames), comment=False) - with ops.Graph().as_default() as g: - with self.test_session(graph=g) as sess: - dataset = self._make_csv_dataset( - file_without_header, - defaults, - batch_size=2, - num_epochs=10, - comment=None, - ) - self._verify_records( - sess, - dataset, - [len(self._test_filenames)], - batch_size=2, - num_epochs=10, - ) - def testMakeCSVDataset_withNoHeader(self): """Tests that datasets can be created from CSV files with no header line. """ @@ -876,7 +849,7 @@ def testMakeCSVDataset_withTypeInference(self): In that case, we should infer the types from the first N records. 
""" - # Test that it works with standard test files (with comments, header, etc) + # Test that it works with standard test files (with header, etc) with ops.Graph().as_default() as g: with self.test_session(graph=g) as sess: dataset = self._make_csv_dataset( @@ -889,7 +862,9 @@ def testMakeCSVDataset_withTypeInference(self): num_epochs=10, defaults=[[], [], [], [], [""]]) - # Test on a deliberately tricky file + def testMakeCSVDataset_withTypeInferenceTricky(self): + # Test on a deliberately tricky file (type changes as we read more rows, and + # there are null values) fn = os.path.join(self.get_temp_dir(), "file.csv") expected_dtypes = [ dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float32, @@ -914,20 +889,29 @@ def testMakeCSVDataset_withTypeInference(self): column_names=None, label_name=None, na_value="NAN", - default_float_type=dtypes.float32, ) features = dataset.make_one_shot_iterator().get_next() # Check that types match for i in range(len(expected_dtypes)): + print(features["col%d" % i].dtype, expected_dtypes[i]) assert features["col%d" % i].dtype == expected_dtypes[i] for i in range(len(rows)): assert sess.run(features) == dict(zip(col_names, expected[i])) - # With float64 as default type for floats + def testMakeCSVDataset_withTypeInferenceAllTypes(self): + # Test that we make the correct inference for all types with fallthrough + fn = os.path.join(self.get_temp_dir(), "file.csv") expected_dtypes = [ - dtypes.int32, dtypes.int64, dtypes.float64, dtypes.float64, + dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64, dtypes.string, dtypes.string ] + col_names = ["col%d" % i for i in range(len(expected_dtypes))] + rows = [[1, 2**31 + 1, 1.0, 4e40, "abc", ""]] + expected = [[ + 1, 2**31 + 1, 1.0, 4e40, "abc".encode("utf-8"), "".encode("utf-8") + ]] + self._write_file("file.csv", [col_names] + rows) + with ops.Graph().as_default() as g: with self.test_session(graph=g) as sess: dataset = self._make_csv_dataset( @@ -936,7 +920,6 @@ def testMakeCSVDataset_withTypeInference(self): column_names=None, label_name=None, na_value="NAN", - default_float_type=dtypes.float64, ) features = dataset.make_one_shot_iterator().get_next() # Check that types match @@ -1086,5 +1069,189 @@ def testMakeCSVDataset_withShuffle(self): self.assertFalse(all_equal) +class MakeTFRecordDatasetTest(TFRecordDatasetTestBase): + + def _next_expected_batch(self, + file_indices, + batch_size, + num_epochs, + cycle_length, + drop_final_batch, + use_parser_fn): + + def _next_record(file_indices): + for j in file_indices: + for i in range(self._num_records): + yield j, i + + def _next_record_interleaved(file_indices, cycle_length): + return _interleave([_next_record([i]) for i in file_indices], + cycle_length) + + record_batch = [] + batch_index = 0 + for _ in range(num_epochs): + if cycle_length == 1: + next_records = _next_record(file_indices) + else: + next_records = _next_record_interleaved(file_indices, cycle_length) + for f, r in next_records: + record = self._record(f, r) + if use_parser_fn: + record = record[1:] + record_batch.append(record) + batch_index += 1 + if len(record_batch) == batch_size: + yield record_batch + record_batch = [] + batch_index = 0 + if record_batch and not drop_final_batch: + yield record_batch + + def _verify_records(self, + sess, + outputs, + batch_size, + file_index, + num_epochs, + interleave_cycle_length, + drop_final_batch, + use_parser_fn): + if file_index is not None: + file_indices = [file_index] + else: + file_indices = range(self._num_files) + + for expected_batch in 
self._next_expected_batch( + file_indices, batch_size, num_epochs, interleave_cycle_length, + drop_final_batch, use_parser_fn): + actual_batch = sess.run(outputs) + self.assertAllEqual(expected_batch, actual_batch) + + def _read_test(self, batch_size, num_epochs, file_index=None, + num_parallel_reads=1, drop_final_batch=False, parser_fn=False): + if file_index is None: + file_pattern = self.test_filenames + else: + file_pattern = self.test_filenames[file_index] + + if parser_fn: + fn = lambda x: string_ops.substr(x, 1, 999) + else: + fn = None + + with ops.Graph().as_default() as g: + with self.test_session(graph=g) as sess: + outputs = readers.make_tf_record_dataset( + file_pattern=file_pattern, + num_epochs=num_epochs, + batch_size=batch_size, + parser_fn=fn, + num_parallel_reads=num_parallel_reads, + drop_final_batch=drop_final_batch, + shuffle=False).make_one_shot_iterator().get_next() + self._verify_records( + sess, outputs, batch_size, file_index, num_epochs=num_epochs, + interleave_cycle_length=num_parallel_reads, + drop_final_batch=drop_final_batch, use_parser_fn=parser_fn) + with self.assertRaises(errors.OutOfRangeError): + sess.run(outputs) + + def testRead(self): + for batch_size in [1, 2]: + for num_epochs in [1, 3]: + # Basic test: read from file 0. + self._read_test(batch_size, num_epochs, 0) + + # Basic test: read from file 1. + self._read_test(batch_size, num_epochs, 1) + + # Basic test: read from both files. + self._read_test(batch_size, num_epochs) + + # Basic test: read from both files, with parallel reads. + self._read_test(batch_size, num_epochs, num_parallel_reads=8) + + def testDropFinalBatch(self): + for batch_size in [1, 2, 10]: + for num_epochs in [1, 3]: + # Read from file 0. + self._read_test(batch_size, num_epochs, 0, drop_final_batch=True) + + # Read from both files. + self._read_test(batch_size, num_epochs, drop_final_batch=True) + + # Read from both files, with parallel reads. 
+ self._read_test(batch_size, num_epochs, num_parallel_reads=8, + drop_final_batch=True) + + def testParserFn(self): + for batch_size in [1, 2]: + for num_epochs in [1, 3]: + for drop_final_batch in [False, True]: + self._read_test(batch_size, num_epochs, parser_fn=True, + drop_final_batch=drop_final_batch) + self._read_test(batch_size, num_epochs, num_parallel_reads=8, + parser_fn=True, drop_final_batch=drop_final_batch) + + def _shuffle_test(self, batch_size, num_epochs, num_parallel_reads=1, + seed=None): + with ops.Graph().as_default() as g: + with self.test_session(graph=g) as sess: + dataset = readers.make_tf_record_dataset( + file_pattern=self.test_filenames, + num_epochs=num_epochs, + batch_size=batch_size, + num_parallel_reads=num_parallel_reads, + shuffle=True, + shuffle_seed=seed) + iterator = dataset.make_initializable_iterator() + next_element = iterator.get_next() + + sess.run(iterator.initializer) + first_batches = [] + try: + while True: + first_batches.append(sess.run(next_element)) + except errors.OutOfRangeError: + pass + + sess.run(iterator.initializer) + second_batches = [] + try: + while True: + second_batches.append(sess.run(next_element)) + except errors.OutOfRangeError: + pass + + self.assertEqual(len(first_batches), len(second_batches)) + if seed is not None: + # if you set a seed, should get the same results + for i in range(len(first_batches)): + self.assertAllEqual(first_batches[i], second_batches[i]) + + expected = [] + for f in range(self._num_files): + for r in range(self._num_records): + expected.extend([self._record(f, r)] * num_epochs) + + for batches in (first_batches, second_batches): + actual = [] + for b in batches: + actual.extend(b) + self.assertAllEqual(sorted(expected), sorted(actual)) + + def testShuffle(self): + for batch_size in [1, 2]: + for num_epochs in [1, 3]: + for num_parallel_reads in [1, 2]: + # Test that all expected elements are produced + self._shuffle_test(batch_size, num_epochs, num_parallel_reads) + # Test that elements are produced in a consistent order if + # you specify a seed. + self._shuffle_test(batch_size, num_epochs, num_parallel_reads, + seed=21345) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/resample_test.py b/tensorflow/contrib/data/python/kernel_tests/resample_test.py index 5f47dcb3399911..bdc003a8a5bd64 100644 --- a/tensorflow/contrib/data/python/kernel_tests/resample_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/resample_test.py @@ -18,6 +18,9 @@ from __future__ import print_function import numpy as np +from six.moves import xrange # pylint: disable=redefined-builtin +import time +from absl.testing import parameterized from tensorflow.contrib.data.python.ops import resampling from tensorflow.python.data.ops import dataset_ops @@ -30,52 +33,98 @@ from tensorflow.python.util import compat -class ResampleTest(test.TestCase): +def _time_resampling( + test_obj, data_np, target_dist, init_dist, num_to_sample): + dataset = dataset_ops.Dataset.from_tensor_slices(data_np).repeat() - def testInitialKnownDistribution(self): - self._testDistribution(initial_known=True) + # Reshape distribution via rejection sampling. 
+ dataset = dataset.apply( + resampling.rejection_resample( + class_func=lambda x: x, + target_dist=target_dist, + initial_dist=init_dist, + seed=142)) - def testInitialNotKnownDistribution(self): - self._testDistribution(initial_known=False) + get_next = dataset.make_one_shot_iterator().get_next() - def _testDistribution(self, initial_known): + with test_obj.test_session() as sess: + start_time = time.time() + for _ in xrange(num_to_sample): + sess.run(get_next) + end_time = time.time() + + return end_time - start_time + + +class ResampleTest(test.TestCase, parameterized.TestCase): + + @parameterized.named_parameters( + ("InitialDistributionKnown", True), + ("InitialDistributionUnknown", False)) + def testDistribution(self, initial_known): classes = np.random.randint(5, size=(20000,)) # Uniformly sampled target_dist = [0.9, 0.05, 0.05, 0.0, 0.0] initial_dist = [0.2] * 5 if initial_known else None - iterator = (dataset_ops.Dataset.from_tensor_slices(classes).shuffle( - 200, seed=21).map(lambda c: (c, string_ops.as_string(c))).apply( - resampling.rejection_resample( - target_dist=target_dist, - initial_dist=initial_dist, - class_func=lambda c, _: c, - seed=27)).make_one_shot_iterator()) - get_next = iterator.get_next() + classes = math_ops.to_int64(classes) # needed for Windows build. + dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle( + 200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat() + + get_next = dataset.apply( + resampling.rejection_resample( + target_dist=target_dist, + initial_dist=initial_dist, + class_func=lambda c, _: c, + seed=27)).make_one_shot_iterator().get_next() with self.test_session() as sess: returned = [] - with self.assertRaises(errors.OutOfRangeError): - while True: - returned.append(sess.run(get_next)) + while len(returned) < 4000: + returned.append(sess.run(get_next)) returned_classes, returned_classes_and_data = zip(*returned) _, returned_data = zip(*returned_classes_and_data) self.assertAllEqual([compat.as_bytes(str(c)) for c in returned_classes], returned_data) total_returned = len(returned_classes) - # Subsampling rejects a large percentage of the initial data in - # this case. - self.assertGreater(total_returned, 20000 * 0.2) class_counts = np.array([ len([True for v in returned_classes if v == c]) for c in range(5)]) returned_dist = class_counts / total_returned self.assertAllClose(target_dist, returned_dist, atol=1e-2) + @parameterized.named_parameters( + ("OnlyInitial", True), + ("NotInitial", False)) + def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist): + init_dist = [0.5, 0.5] + target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0] + num_classes = len(init_dist) + # We don't need many samples to test that this works. + num_samples = 100 + data_np = np.random.choice(num_classes, num_samples, p=init_dist) + + dataset = dataset_ops.Dataset.from_tensor_slices(data_np) + + # Reshape distribution. 
+ dataset = dataset.apply( + resampling.rejection_resample( + class_func=lambda x: x, + target_dist=target_dist, + initial_dist=init_dist)) + + get_next = dataset.make_one_shot_iterator().get_next() + + with self.test_session() as sess: + returned = [] + with self.assertRaises(errors.OutOfRangeError): + while True: + returned.append(sess.run(get_next)) + def testRandomClasses(self): init_dist = [0.25, 0.25, 0.25, 0.25] target_dist = [0.0, 0.0, 0.0, 1.0] num_classes = len(init_dist) - # We don't need many samples to test a dirac-delta target distribution + # We don't need many samples to test a dirac-delta target distribution. num_samples = 100 data_np = np.random.choice(num_classes, num_samples, p=init_dist) @@ -109,5 +158,23 @@ def _remap_fn(_): self.assertAllClose(target_dist, bincount, atol=1e-2) + +class ResampleDatasetBenchmark(test.Benchmark): + + def benchmarkResamplePerformance(self): + init_dist = [0.25, 0.25, 0.25, 0.25] + target_dist = [0.0, 0.0, 0.0, 1.0] + num_classes = len(init_dist) + # We don't need many samples to test a dirac-delta target distribution + num_samples = 1000 + data_np = np.random.choice(num_classes, num_samples, p=init_dist) + + resample_time = _time_resampling( + self, data_np, target_dist, init_dist, num_to_sample=1000) + + self.report_benchmark( + iters=1000, wall_time=resample_time, name="benchmark_resample") + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py index 1a97a84b2cba13..eb2ceff893543f 100644 --- a/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/scan_dataset_op_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.platform import test @@ -35,15 +36,19 @@ class ScanDatasetTest(test.TestCase): - def _count(self, start, step): - return dataset_ops.Dataset.from_tensors(0).repeat(None).apply( - scan_ops.scan(start, lambda state, _: (state + step, state))) + def _counting_dataset(self, start, scan_fn): + return dataset_ops.Dataset.from_tensors(0).repeat().apply( + scan_ops.scan(start, scan_fn)) def testCount(self): + def make_scan_fn(step): + return lambda state, _: (state + step, state) + start = array_ops.placeholder(dtypes.int32, shape=[]) step = array_ops.placeholder(dtypes.int32, shape=[]) take = array_ops.placeholder(dtypes.int64, shape=[]) - iterator = self._count(start, step).take(take).make_initializable_iterator() + iterator = self._counting_dataset( + start, make_scan_fn(step)).take(take).make_initializable_iterator() next_element = iterator.get_next() with self.test_session() as sess: @@ -78,6 +83,37 @@ def testFibonacci(self): self.assertEqual(5, self.evaluate(next_element())) self.assertEqual(8, self.evaluate(next_element())) + def testSparseCount(self): + def _sparse(i): + return sparse_tensor.SparseTensorValue( + indices=np.array([[0, 0]]), + values=(i * np.array([1])), + dense_shape=np.array([1, 1])) + + def make_scan_fn(step): + return lambda state, _: (_sparse(state.values[0] + step), state) + + start = array_ops.placeholder(dtypes.int32, shape=[]) + step = array_ops.placeholder(dtypes.int32, shape=[]) + take = 
array_ops.placeholder(dtypes.int64, shape=[]) + iterator = self._counting_dataset( + _sparse(start), + make_scan_fn(step)).take(take).make_initializable_iterator() + next_element = iterator.get_next() + + with self.test_session() as sess: + + for start_val, step_val, take_val in [(0, 1, 10), (0, 1, 0), (10, 1, 10), + (10, 2, 10), (10, -1, 10), + (10, -2, 10)]: + sess.run(iterator.initializer, + feed_dict={start: start_val, step: step_val, take: take_val}) + for expected, _ in zip( + itertools.count(start_val, step_val), range(take_val)): + self.assertEqual(expected, sess.run(next_element).values[0]) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + def testChangingStateShape(self): # Test the fixed-point shape invariant calculations: start with # initial values with known shapes, and use a scan function that @@ -132,7 +168,7 @@ def _scan_fn(unused_state, unused_input_value): scan_ops.scan(constant_op.constant(1, dtype=dtypes.int32), _scan_fn)) -class ScanDatasetSerialzationTest( +class ScanDatasetSerializationTest( dataset_serialization_test_base.DatasetSerializationTestBase): def _build_dataset(self, num_elements): diff --git a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py index e26cef8ec522c7..4148addf2878c9 100644 --- a/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py +++ b/tensorflow/contrib/data/python/kernel_tests/sql_dataset_op_test.py @@ -22,6 +22,7 @@ import sqlite3 +from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base from tensorflow.contrib.data.python.ops import readers from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors @@ -29,7 +30,7 @@ from tensorflow.python.platform import test -class SqlDatasetTest(test.TestCase): +class SqlDatasetTestBase(test.TestCase): def _createSqlDataset(self, output_types, num_repeats=1): dataset = readers.SqlDataset(self.driver_name, self.data_source_name, @@ -92,6 +93,9 @@ def setUp(self): conn.commit() conn.close() + +class SqlDatasetTest(SqlDatasetTestBase): + # Test that SqlDataset can read from a database table. 
def testReadResultSet(self): init_op, get_next = self._createSqlDataset((dtypes.string, dtypes.string, @@ -652,5 +656,27 @@ def testReadResultSetFloat64LargestConsecutiveWholeNumbersNotEqual(self): sess.run(get_next) +class SqlDatasetSerializationTest( + SqlDatasetTestBase, + dataset_serialization_test_base.DatasetSerializationTestBase): + + def _build_dataset(self, num_repeats): + data_source_name = os.path.join(test.get_temp_dir(), "tftest.sqlite") + driver_name = array_ops.placeholder_with_default( + array_ops.constant("sqlite", dtypes.string), shape=[]) + query = ("SELECT first_name, last_name, motto FROM students ORDER BY " + "first_name DESC") + output_types = (dtypes.string, dtypes.string, dtypes.string) + return readers.SqlDataset(driver_name, data_source_name, query, + output_types).repeat(num_repeats) + + def testSQLSaveable(self): + num_repeats = 4 + num_outputs = num_repeats * 2 + self.run_core_tests(lambda: self._build_dataset(num_repeats), + lambda: self._build_dataset(num_repeats // 2), + num_outputs) + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD index 5b04c5316cfbb7..086661adb76033 100644 --- a/tensorflow/contrib/data/python/ops/BUILD +++ b/tensorflow/contrib/data/python/ops/BUILD @@ -45,6 +45,27 @@ py_library( "//tensorflow/python:dataset_ops_gen", "//tensorflow/python:framework_ops", "//tensorflow/python:training", + "//tensorflow/python/data/ops:iterator_ops", + ], +) + +py_test( + name = "iterator_ops_test", + size = "small", + srcs = ["iterator_ops_test.py"], + srcs_version = "PY2AND3", + tags = ["no_pip"], + deps = [ + ":iterator_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:constant_op", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:training", + "//tensorflow/python:variables", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/estimator", + "//tensorflow/python/estimator:model_fn", ], ) @@ -187,12 +208,27 @@ py_library( ], ) +py_library( + name = "optimization", + srcs = ["optimization.py"], + srcs_version = "PY2AND3", + deps = [ + ":contrib_op_loader", + ":gen_dataset_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python/data/util:nest", + "//tensorflow/python/data/util:sparse", + ], +) + py_library( name = "resampling", srcs = ["resampling.py"], srcs_version = "PY2AND3", deps = [ ":batching", + ":interleave_ops", ":scan_ops", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", @@ -202,6 +238,7 @@ py_library( "//tensorflow/python:math_ops", "//tensorflow/python:random_ops", "//tensorflow/python/data/ops:dataset_ops", + "//third_party/py/numpy", ], ) @@ -345,6 +382,7 @@ py_library( ":get_single_element", ":grouping", ":interleave_ops", + ":optimization", ":prefetching_ops", ":readers", ":resampling", diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py index 2152bcde84aae6..b9393de4e90ae2 100644 --- a/tensorflow/contrib/data/python/ops/batching.py +++ b/tensorflow/contrib/data/python/ops/batching.py @@ -364,7 +364,7 @@ def __init__(self, with the structure of `dataset`. """ super(_RestructuredDataset, self).__init__() - self._dataset = dataset + self._input_dataset = dataset if not allow_unsafe_cast: # Validate that the types are compatible. 
@@ -408,7 +408,7 @@ def __init__(self, self._output_classes = output_classes def _as_variant_tensor(self): - return self._dataset._as_variant_tensor() # pylint: disable=protected-access + return self._input_dataset._as_variant_tensor() # pylint: disable=protected-access @property def output_classes(self): @@ -466,14 +466,14 @@ def _apply_fn(dataset): class _MapAndBatchDataset(dataset_ops.MapDataset): """A `Dataset` that maps a function over a batch of elements.""" - def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches, + def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls, drop_remainder): """See `Dataset.map()` for details.""" super(_MapAndBatchDataset, self).__init__(input_dataset, map_func) self._batch_size_t = ops.convert_to_tensor( batch_size, dtype=dtypes.int64, name="batch_size") - self._num_parallel_batches_t = ops.convert_to_tensor( - num_parallel_batches, dtype=dtypes.int64, name="num_parallel_batches") + self._num_parallel_calls_t = ops.convert_to_tensor( + num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls") self._drop_remainder_t = ops.convert_to_tensor( drop_remainder, dtype=dtypes.bool, name="drop_remainder") @@ -483,12 +483,12 @@ def __init__(self, input_dataset, map_func, batch_size, num_parallel_batches, def _as_variant_tensor(self): # pylint: disable=protected-access input_resource = self._input_dataset._as_variant_tensor() - return gen_dataset_ops.map_and_batch_dataset( + return gen_dataset_ops.map_and_batch_dataset_v2( input_resource, self._map_func.captured_inputs, f=self._map_func, batch_size=self._batch_size_t, - num_parallel_batches=self._num_parallel_batches_t, + num_parallel_calls=self._num_parallel_calls_t, drop_remainder=self._drop_remainder_t, output_types=nest.flatten( sparse.as_dense_types(self.output_types, self.output_classes)), @@ -511,8 +511,9 @@ def output_types(self): def map_and_batch(map_func, batch_size, - num_parallel_batches=1, - drop_remainder=False): + num_parallel_batches=None, + drop_remainder=False, + num_parallel_calls=None): """Fused implementation of `map` and `batch`. Maps `map_func` across `batch_size` consecutive elements of this dataset @@ -528,21 +529,37 @@ def map_and_batch(map_func, nested structure of tensors. batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of consecutive elements of this dataset to combine in a single batch. - num_parallel_batches: A `tf.int64` scalar `tf.Tensor`, representing the - number of batches to create in parallel. On one hand, higher values can - help mitigate the effect of stragglers. On the other hand, higher values - can increase contention if CPU is scarce. - drop_remainder: A `tf.bool` scalar `tf.Tensor`, representing whether the - last batch should be dropped in case its size is smaller than desired; - the default behavior is not to drop the smaller batch. + num_parallel_batches: (Optional.) A `tf.int64` scalar `tf.Tensor`, + representing the number of batches to create in parallel. On one hand, + higher values can help mitigate the effect of stragglers. On the other + hand, higher values can increase contention if CPU is scarce. + drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing + whether the last batch should be dropped in case its size is smaller than + desired; the default behavior is not to drop the smaller batch. + num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`, + representing the number of elements to process in parallel. 
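To make the new parallelism argument concrete, a minimal sketch of applying `map_and_batch` with `num_parallel_calls`, the element-level knob this change introduces; the map function and numbers are illustrative, not part of the patch:

```python
import tensorflow as tf

dataset = tf.data.Dataset.range(100)
dataset = dataset.apply(
    tf.contrib.data.map_and_batch(
        map_func=lambda x: x * x,   # illustrative map function
        batch_size=10,
        num_parallel_calls=4))      # process up to 4 elements in parallel

# Supplying both knobs at once is rejected:
#   map_and_batch(..., num_parallel_batches=2, num_parallel_calls=4)
# raises ValueError, since the two arguments are mutually exclusive.
```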
If not + specified, `batch_size * num_parallel_batches` elements will be + processed in parallel. Returns: A `Dataset` transformation function, which can be passed to @{tf.data.Dataset.apply}. + + Raises: + ValueError: If both `num_parallel_batches` and `num_parallel_calls` are + specified. """ + if num_parallel_batches is None and num_parallel_calls is None: + num_parallel_calls = batch_size + elif num_parallel_batches is not None and num_parallel_calls is None: + num_parallel_calls = batch_size * num_parallel_batches + elif num_parallel_batches is not None and num_parallel_calls is not None: + raise ValueError("The `num_parallel_batches` and `num_parallel_calls` " + "arguments are mutually exclusive.") + def _apply_fn(dataset): return _MapAndBatchDataset(dataset, map_func, batch_size, - num_parallel_batches, drop_remainder) + num_parallel_calls, drop_remainder) return _apply_fn diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py index 0531f9cbb9da6e..ea229b5b27b117 100644 --- a/tensorflow/contrib/data/python/ops/grouping.py +++ b/tensorflow/contrib/data/python/ops/grouping.py @@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import function from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import check_ops @@ -33,6 +34,35 @@ from tensorflow.python.ops import math_ops +def group_by_reducer(key_func, reducer): + """A transformation that groups elements and performs a reduction. + + This transformation maps element of a dataset to a key using `key_func` and + groups the elements by key. The `reducer` is used to process each group; its + `init_func` is used to initialize state for each group when it is created, the + `reduce_func` is used to update the state every time an element is mapped to + the matching group, and the `finalize_func` is used to map the final state to + an output value. + + Args: + key_func: A function mapping a nested structure of tensors + (having shapes and types defined by `self.output_shapes` and + `self.output_types`) to a scalar `tf.int64` tensor. + reducer: An instance of `Reducer`, which captures the reduction logic using + the `init_func`, `reduce_func`, and `finalize_func` functions. + + Returns: + A `Dataset` transformation function, which can be passed to + @{tf.data.Dataset.apply}. 
+ """ + + def _apply_fn(dataset): + """Function from `Dataset` to `Dataset` that applies the transformation.""" + return GroupByReducerDataset(dataset, key_func, reducer) + + return _apply_fn + + def group_by_window(key_func, reduce_func, window_size=None, @@ -227,6 +257,250 @@ def output_types(self): return self._output_types +class GroupByReducerDataset(dataset_ops.Dataset): + """A `Dataset` that groups its input and performs a reduction.""" + + def __init__(self, input_dataset, key_func, reducer): + """See `group_by_reducer()` for details.""" + super(GroupByReducerDataset, self).__init__() + + self._input_dataset = input_dataset + + self._make_key_func(key_func, input_dataset) + self._make_init_func(reducer.init_func) + self._make_reduce_func(reducer.reduce_func, input_dataset) + self._make_finalize_func(reducer.finalize_func) + + def _make_key_func(self, key_func, input_dataset): + """Make wrapping Defun for key_func.""" + + @function.Defun(*nest.flatten( + sparse.as_dense_types(input_dataset.output_types, + input_dataset.output_classes))) + def tf_key_func(*args): + """A wrapper for Defun that facilitates shape inference.""" + # Pass in shape information from the input_dataset. + dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes, + input_dataset.output_classes) + for arg, shape in zip(args, nest.flatten(dense_shapes)): + arg.set_shape(shape) + + nested_args = nest.pack_sequence_as(input_dataset.output_types, args) + nested_args = sparse.deserialize_sparse_tensors( + nested_args, input_dataset.output_types, input_dataset.output_shapes, + input_dataset.output_classes) + # pylint: disable=protected-access + if dataset_ops._should_unpack_args(nested_args): + ret = key_func(*nested_args) + # pylint: enable=protected-access + else: + ret = key_func(nested_args) + ret = ops.convert_to_tensor(ret) + if ret.dtype != dtypes.int64 or ret.get_shape() != tensor_shape.scalar(): + raise ValueError( + "`key_func` must return a single tf.int64 tensor. " + "Got type=%s and shape=%s" % (ret.dtype, ret.get_shape())) + return ret + + self._key_func = tf_key_func + self._key_func.add_to_graph(ops.get_default_graph()) + + def _make_init_func(self, init_func): + """Make wrapping Defun for init_func.""" + + @function.Defun(dtypes.int64) + def tf_init_func(key): + """A wrapper for Defun that facilitates shape inference.""" + key.set_shape([]) + ret = init_func(key) + # Convert any `SparseTensorValue`s to `SparseTensor`s and all other + # values to tensors. + ret = nest.pack_sequence_as(ret, [ + sparse_tensor.SparseTensor.from_value(t) + if sparse_tensor.is_sparse(t) else ops.convert_to_tensor(t) + for t in nest.flatten(ret) + ]) + + self._state_classes = sparse.get_classes(ret) + self._state_shapes = nest.pack_sequence_as( + ret, [t.get_shape() for t in nest.flatten(ret)]) + self._state_types = nest.pack_sequence_as( + ret, [t.dtype for t in nest.flatten(ret)]) + + # Serialize any sparse tensors. + ret = nest.pack_sequence_as( + ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))]) + return nest.flatten(ret) + + self._init_func = tf_init_func + self._init_func.add_to_graph(ops.get_default_graph()) + + def _make_reduce_func(self, reduce_func, input_dataset): + """Make wrapping Defun for reduce_func.""" + + # Iteratively rerun the reduce function until reaching a fixed point on + # `self._state_shapes`. + need_to_rerun = True + while need_to_rerun: + + # Create a list in which `tf_reduce_func` will store the new shapes. 
+ flat_new_state_shapes = [] + + @function.Defun(*(nest.flatten( + sparse.as_dense_types( + self._state_types, self._state_classes)) + nest.flatten( + sparse.as_dense_types(input_dataset.output_types, + input_dataset.output_classes)))) + def tf_reduce_func(*args): + """A wrapper for Defun that facilitates shape inference.""" + for arg, shape in zip( + args, + nest.flatten( + sparse.as_dense_shapes(self._state_shapes, self._state_classes)) + + nest.flatten( + sparse.as_dense_shapes(input_dataset.output_shapes, + input_dataset.output_classes))): + arg.set_shape(shape) + + pivot = len(nest.flatten(self._state_shapes)) + nested_state_args = nest.pack_sequence_as(self._state_types, + args[:pivot]) + nested_state_args = sparse.deserialize_sparse_tensors( + nested_state_args, self._state_types, self._state_shapes, + self._state_classes) + nested_input_args = nest.pack_sequence_as(input_dataset.output_types, + args[pivot:]) + nested_input_args = sparse.deserialize_sparse_tensors( + nested_input_args, input_dataset.output_types, + input_dataset.output_shapes, input_dataset.output_classes) + + ret = reduce_func(nested_state_args, nested_input_args) + + # Convert any `SparseTensorValue`s to `SparseTensor`s and all other + # values to tensors. + ret = nest.pack_sequence_as(ret, [ + sparse_tensor.SparseTensor.from_value(t) + if sparse_tensor.is_sparse(t) else ops.convert_to_tensor(t) + for t in nest.flatten(ret) + ]) + + # Extract shape information from the returned values. + flat_new_state = nest.flatten(ret) + flat_new_state_shapes.extend([t.get_shape() for t in flat_new_state]) + + # Extract and validate type information from the returned values. + for t, dtype in zip(flat_new_state, nest.flatten(self._state_types)): + if t.dtype != dtype: + raise TypeError( + "The element types for the new state must match the initial " + "state. Expected %s; got %s." % + (self._state_types, + nest.pack_sequence_as(self._state_types, + [t.dtype for t in flat_new_state]))) + + # Serialize any sparse tensors. + ret = nest.pack_sequence_as( + ret, + [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))]) + return nest.flatten(ret) + + # Use the private method that will execute `tf_reduce_func` but delay + # adding it to the graph in case we need to rerun the function. 
+ tf_reduce_func._create_definition_if_needed() # pylint: disable=protected-access + + flat_state_shapes = nest.flatten(self._state_shapes) + weakened_state_shapes = [ + old.most_specific_compatible_shape(new) + for old, new in zip(flat_state_shapes, flat_new_state_shapes) + ] + + need_to_rerun = False + for old_shape, weakened_shape in zip(flat_state_shapes, + weakened_state_shapes): + if old_shape.ndims is not None and ( + weakened_shape.ndims is None or + old_shape.as_list() != weakened_shape.as_list()): + need_to_rerun = True + break + + if need_to_rerun: + self._state_shapes = nest.pack_sequence_as(self._state_shapes, + weakened_state_shapes) + + self._reduce_func = tf_reduce_func + self._reduce_func.add_to_graph(ops.get_default_graph()) + + def _make_finalize_func(self, finalize_func): + """Make wrapping Defun for finalize_func.""" + + @function.Defun(*(nest.flatten( + sparse.as_dense_types(self._state_types, self._state_classes)))) + def tf_finalize_func(*args): + """A wrapper for Defun that facilitates shape inference.""" + for arg, shape in zip( + args, + nest.flatten( + sparse.as_dense_shapes(self._state_shapes, self._state_classes))): + arg.set_shape(shape) + + nested_args = nest.pack_sequence_as(self._state_types, args) + nested_args = sparse.deserialize_sparse_tensors( + nested_args, self._state_types, self._state_shapes, + self._state_classes) + + ret = finalize_func(nested_args) + + # Convert any `SparseTensorValue`s to `SparseTensor`s and all other + # values to tensors. + ret = nest.pack_sequence_as(ret, [ + sparse_tensor.SparseTensor.from_value(t) + if sparse_tensor.is_sparse(t) else ops.convert_to_tensor(t) + for t in nest.flatten(ret) + ]) + + self._output_classes = sparse.get_classes(ret) + self._output_shapes = nest.pack_sequence_as( + ret, [t.get_shape() for t in nest.flatten(ret)]) + self._output_types = nest.pack_sequence_as( + ret, [t.dtype for t in nest.flatten(ret)]) + + # Serialize any sparse tensors. + ret = nest.pack_sequence_as( + ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))]) + return nest.flatten(ret) + + self._finalize_func = tf_finalize_func + self._finalize_func.add_to_graph(ops.get_default_graph()) + + @property + def output_classes(self): + return self._output_classes + + @property + def output_shapes(self): + return self._output_shapes + + @property + def output_types(self): + return self._output_types + + def _as_variant_tensor(self): + return gen_dataset_ops.group_by_reducer_dataset( + self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access + self._key_func.captured_inputs, + self._init_func.captured_inputs, + self._reduce_func.captured_inputs, + self._finalize_func.captured_inputs, + key_func=self._key_func, + init_func=self._init_func, + reduce_func=self._reduce_func, + finalize_func=self._finalize_func, + output_types=nest.flatten( + sparse.as_dense_types(self.output_types, self.output_classes)), + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes))) + + class GroupByWindowDataset(dataset_ops.Dataset): """A `Dataset` that groups its input and performs a windowed reduction.""" @@ -336,3 +610,30 @@ def _as_variant_tensor(self): sparse.as_dense_types(self.output_types, self.output_classes)), output_shapes=nest.flatten( sparse.as_dense_shapes(self.output_shapes, self.output_classes))) + + +class Reducer(object): + """A reducer is used for reducing a set of elements. 
+ + A reducer is represented as a tuple of the three functions: + 1) initialization function: key => initial state + 2) reduce function: (old state, input) => new state + 3) finalization function: state => result + """ + + def __init__(self, init_func, reduce_func, finalize_func): + self._init_func = init_func + self._reduce_func = reduce_func + self._finalize_func = finalize_func + + @property + def init_func(self): + return self._init_func + + @property + def reduce_func(self): + return self._reduce_func + + @property + def finalize_func(self): + return self._finalize_func diff --git a/tensorflow/contrib/data/python/ops/interleave_ops.py b/tensorflow/contrib/data/python/ops/interleave_ops.py index 812a50ecbf1053..be66fbac50753c 100644 --- a/tensorflow/contrib/data/python/ops/interleave_ops.py +++ b/tensorflow/contrib/data/python/ops/interleave_ops.py @@ -27,6 +27,7 @@ from tensorflow.python.data.util import sparse from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_shape from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.util import deprecation @@ -240,3 +241,47 @@ def select_dataset(logits, seed): (logits_ds, random_ops.RandomDataset(seed).batch(2))).map(select_dataset) return DirectedInterleaveDataset(selector_input, datasets) + + +def choose_from_datasets(datasets, choice_dataset): + """Creates a dataset that deterministically chooses elements from `datasets`. + + For example, given the following datasets: + + ```python + datasets = [tf.data.Dataset.from_tensors("foo").repeat(), + tf.data.Dataset.from_tensors("bar").repeat(), + tf.data.Dataset.from_tensors("baz").repeat()] + + # Define a dataset containing `[0, 1, 2, 0, 1, 2, 0, 1, 2]`. + choice_dataset = tf.data.Dataset.range(3).repeat(3) + + result = tf.contrib.data.choose_from_datasets(datasets, choice_dataset) + ``` + + The elements of `result` will be: + + ``` + "foo", "bar", "baz", "foo", "bar", "baz", "foo", "bar", "baz" + ``` + + Args: + datasets: A list of @{tf.data.Dataset} objects with compatible structure. + choice_dataset: A @{tf.data.Dataset} of scalar `tf.int64` tensors between + `0` and `len(datasets) - 1`. + + Returns: + A dataset that interleaves elements from `datasets` according to the values + of `choice_dataset`. + + Raises: + TypeError: If the `datasets` or `choice_dataset` arguments have the wrong + type. 
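Putting the new `group_by_reducer` transformation and the `Reducer` class together, a minimal sketch that sums integers per parity group; it assumes both names are exposed through `tf.contrib.data` like the other transformations in this patch, and the dataset and values are illustrative:

```python
import numpy as np
import tensorflow as tf

reducer = tf.contrib.data.Reducer(
    init_func=lambda _: np.int64(0),         # key -> initial state
    reduce_func=lambda state, x: state + x,  # (state, element) -> new state
    finalize_func=lambda state: state)       # state -> output element

dataset = tf.data.Dataset.range(10).apply(
    tf.contrib.data.group_by_reducer(
        key_func=lambda x: x % 2,            # must be a scalar tf.int64
        reducer=reducer))

# The resulting dataset yields one element per key: the sum of the even
# numbers (20) and the sum of the odd numbers (25), in some group order.
```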
+ """ + if not (choice_dataset.output_types == dtypes.int64 + and choice_dataset.output_shapes.is_compatible_with( + tensor_shape.scalar()) + and choice_dataset.output_classes == ops.Tensor): + raise TypeError("`choice_dataset` must be a dataset of scalar " + "`tf.int64` tensors.") + return DirectedInterleaveDataset(choice_dataset, datasets) diff --git a/tensorflow/contrib/data/python/ops/iterator_ops.py b/tensorflow/contrib/data/python/ops/iterator_ops.py index d736029fb035e5..0d71be66018eee 100644 --- a/tensorflow/contrib/data/python/ops/iterator_ops.py +++ b/tensorflow/contrib/data/python/ops/iterator_ops.py @@ -16,10 +16,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function - +from tensorflow.python.data.ops import iterator_ops from tensorflow.python.framework import ops from tensorflow.python.ops import gen_dataset_ops -from tensorflow.python.training import saver +from tensorflow.python.training import basic_session_run_hooks +from tensorflow.python.training import saver as saver_lib +from tensorflow.python.training import session_run_hook def make_saveable_from_iterator(iterator): @@ -60,14 +62,14 @@ def make_saveable_from_iterator(iterator): return _Saveable(iterator._iterator_resource) # pylint: disable=protected-access -class _Saveable(saver.BaseSaverBuilder.SaveableObject): +class _Saveable(saver_lib.BaseSaverBuilder.SaveableObject): """SaveableObject for saving/restoring iterator state.""" def __init__(self, iterator_resource): serialized_iterator = gen_dataset_ops.serialize_iterator(iterator_resource) specs = [ - saver.BaseSaverBuilder.SaveSpec(serialized_iterator, "", - iterator_resource.name + "-state") + saver_lib.BaseSaverBuilder.SaveSpec(serialized_iterator, "", + iterator_resource.name + "-state") ] super(_Saveable, self).__init__(iterator_resource, specs, iterator_resource.name) @@ -75,3 +77,182 @@ def __init__(self, iterator_resource): def restore(self, restored_tensors, unused_restored_shapes): with ops.colocate_with(self.op): return gen_dataset_ops.deserialize_iterator(self.op, restored_tensors[0]) + + +class CheckpointInputPipelineHook(session_run_hook.SessionRunHook): + """Checkpoints input pipeline state every N steps or seconds. + + This hook saves the state of the iterators in the `Graph` so that when + training is resumed the input pipeline continues from where it left off. + This could potentially avoid overfitting in certain pipelines where the + number of training steps per eval are small compared to the dataset + size or if the training pipeline is pre-empted. + + Differences from `CheckpointSaverHook`: + 1. Saves only the input pipelines in the "iterators" collection and not the + global variables or other saveable objects. + 2. Does not write the `GraphDef` and `MetaGraphDef` to the summary. + + Example of checkpointing the training pipeline: + + ```python + est = tf.estimator.Estimator(model_fn) + while True: + est.train( + train_input_fn, + hooks=[tf.contrib.data.CheckpointInputPipelineHook(est)], + steps=train_steps_per_eval) + # Note: We do not pass the hook here. + metrics = est.evaluate(eval_input_fn) + if should_stop_the_training(metrics): + break + ``` + + This hook should be used if the input pipeline state needs to be saved + separate from the model checkpoint. Doing so may be useful for a few reasons: + 1. The input pipeline checkpoint may be large, if there are large shuffle + or prefetch buffers for instance, and may bloat the checkpoint size. + 2. 
If the input pipeline is shared between training and validation, restoring + the checkpoint during validation may override the validation input + pipeline. + + For saving the input pipeline checkpoint alongside the model weights use + @{tf.contrib.data.make_saveable_from_iterator} directly to create a + `SaveableObject` and add to the `SAVEABLE_OBJECTS` collection. Note, however, + that you will need to be careful not to restore the training iterator during + eval. You can do that by not adding the iterator to the SAVEABLE_OBJECTS + collector when building the eval graph. + """ + + def __init__(self, estimator): + """Initializes a `CheckpointInputPipelineHook`. + + Args: + estimator: Estimator. + + Raises: + ValueError: One of `save_steps` or `save_secs` should be set. + ValueError: At most one of saver or scaffold should be set. + """ + # `checkpoint_basename` is "input.ckpt" for non-distributed pipelines or + # of the form "input__.ckpt" for distributed pipelines. + # Note: The default `checkpoint_basename` used by `CheckpointSaverHook` is + # "model.ckpt". We intentionally choose the input pipeline checkpoint prefix + # to be different to avoid conflicts with the model checkpoint. + + # pylint: disable=protected-access + checkpoint_prefix = "input" + if estimator._config.num_worker_replicas > 1: + # Distributed setting. + suffix = "_{}_{}".format(estimator._config.task_type, + estimator._config.task_id) + checkpoint_prefix += suffix + # pylint: enable=protected-access + + # We use a composition paradigm instead of inheriting from + # `CheckpointSaverHook` because `Estimator` does an `isinstance` check + # to check whether a `CheckpointSaverHook` is already present in the list + # of hooks and if not, adds one. Inheriting from `CheckpointSaverHook` + # would thwart this behavior. This hook checkpoints *only the iterators* + # and not the graph variables. + self._checkpoint_saver_hook = basic_session_run_hooks.CheckpointSaverHook( + estimator.model_dir, + save_secs=estimator._config.save_checkpoints_secs, # pylint: disable=protected-access + save_steps=estimator._config.save_checkpoints_steps, # pylint: disable=protected-access + checkpoint_basename=checkpoint_prefix + ".ckpt") + + # Name for the protocol buffer file that will contain the list of most + # recent checkpoints stored as a `CheckpointState` protocol buffer. + # This file, kept in the same directory as the checkpoint files, is + # automatically managed by the `Saver` to keep track of recent checkpoints. + # The default name used by the `Saver` for this file is "checkpoint". Here + # we use the name "checkpoint_" so that in case the + # `checkpoint_dir` is the same as the model checkpoint directory, there are + # no conflicts during restore. + self._latest_filename = "checkpoint_" + checkpoint_prefix + self._first_run = True + + def begin(self): + # Build a Saver that saves all iterators in the `GLOBAL_ITERATORS` + # collection if no `Saver` or `Scaffold` is provided. 
+ # pylint: disable=protected-access + if (self._checkpoint_saver_hook._saver is None and + self._checkpoint_saver_hook._scaffold is None): + iterators = ops.get_collection(iterator_ops.GLOBAL_ITERATORS) + saveables = [_Saveable(i) for i in iterators] + self._checkpoint_saver_hook._saver = _CustomSaver(saveables, + self._latest_filename) + # pylint: enable=protected-access + self._checkpoint_saver_hook.begin() + + def _restore_or_save_initial_ckpt(self, session): + # Ideally this should be run in after_create_session but is not for the + # following reason: + # Currently there is no way of enforcing an order of running the + # `SessionRunHooks`. Hence it is possible that the `_DatasetInitializerHook` + # is run *after* this hook. That is troublesome because + # 1. If a checkpoint exists and this hook restores it, the initializer hook + # will override it. + # 2. If no checkpoint exists, this hook will try to save an initialized + # iterator which will result in an exception. + # + # As a temporary fix we enter the following implicit contract between this + # hook and the _DatasetInitializerHook. + # 1. The _DatasetInitializerHook initializes the iterator in the call to + # after_create_session. + # 2. This hook saves the iterator on the first call to `before_run()`, which + # is guaranteed to happen after `after_create_session()` of all hooks + # have been run. + + # Check if there is an existing checkpoint. If so, restore from it. + # pylint: disable=protected-access + latest_checkpoint_path = saver_lib.latest_checkpoint( + self._checkpoint_saver_hook._checkpoint_dir, + latest_filename=self._latest_filename) + if latest_checkpoint_path: + self._checkpoint_saver_hook._get_saver().restore(session, + latest_checkpoint_path) + else: + # The checkpoint saved here is the state at step "global_step". + # Note: We do not save the GraphDef or MetaGraphDef here. + global_step = session.run(self._checkpoint_saver_hook._global_step_tensor) + self._checkpoint_saver_hook._save(session, global_step) + self._checkpoint_saver_hook._timer.update_last_triggered_step(global_step) + # pylint: enable=protected-access + + def before_run(self, run_context): + if self._first_run: + self._restore_or_save_initial_ckpt(run_context.session) + self._first_run = False + return self._checkpoint_saver_hook.before_run(run_context) + + def after_run(self, run_context, run_values): + self._checkpoint_saver_hook.after_run(run_context, run_values) + + def end(self, session): + self._checkpoint_saver_hook.end(session) + + +class _CustomSaver(saver_lib.Saver): + """`Saver` with a different default `latest_filename`. + + This is used in the `CheckpointInputPipelineHook` to avoid conflicts with + the model ckpt saved by the `CheckpointSaverHook`. 
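As a companion to the hook above, its docstring points to @{tf.contrib.data.make_saveable_from_iterator} for saving iterator state alongside the model weights; a minimal sketch of that pattern (the dataset and variable names are illustrative):

```python
import tensorflow as tf

dataset = tf.data.Dataset.range(100)
iterator = dataset.make_initializable_iterator()

# Register the iterator state so an ordinary Saver checkpoints it too.
saveable = tf.contrib.data.make_saveable_from_iterator(iterator)
tf.add_to_collection(tf.GraphKeys.SAVEABLE_OBJECTS, saveable)

saver = tf.train.Saver()  # now saves and restores the iterator position
```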
+ """ + + def __init__(self, var_list, latest_filename): + super(_CustomSaver, self).__init__(var_list) + self._latest_filename = latest_filename + + def save(self, + sess, + save_path, + global_step=None, + latest_filename=None, + meta_graph_suffix="meta", + write_meta_graph=True, + write_state=True, + strip_default_attrs=False): + return super(_CustomSaver, self).save( + sess, save_path, global_step, latest_filename or self._latest_filename, + meta_graph_suffix, write_meta_graph, write_state, strip_default_attrs) diff --git a/tensorflow/contrib/data/python/ops/iterator_ops_test.py b/tensorflow/contrib/data/python/ops/iterator_ops_test.py new file mode 100644 index 00000000000000..30a993b1f7056b --- /dev/null +++ b/tensorflow/contrib/data/python/ops/iterator_ops_test.py @@ -0,0 +1,123 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for experimental iterator_ops.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.data.python.ops import iterator_ops +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.estimator import estimator +from tensorflow.python.estimator import model_fn +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.training import saver as saver_lib +from tensorflow.python.training import training_util + + +class CheckpointInputPipelineHookTest(test.TestCase): + + @staticmethod + def _model_fn(features, labels, mode, config): + del labels + del mode + del config + global_step = training_util.get_or_create_global_step() + update_global_step_op = global_step.assign_add(1) + latest_feature = variables.Variable( + 0, name='latest_feature', dtype=dtypes.int64) + store_latest_feature_op = latest_feature.assign(features) + ops.add_to_collection('my_vars', global_step) + ops.add_to_collection('my_vars', latest_feature) + return model_fn.EstimatorSpec( + mode='train', + train_op=control_flow_ops.group( + [update_global_step_op, store_latest_feature_op]), + loss=constant_op.constant(2.0)) + + def _read_vars(self, model_dir): + """Returns (global_step, latest_feature).""" + with ops.Graph().as_default() as g: + ckpt_path = saver_lib.latest_checkpoint(model_dir) + meta_filename = ckpt_path + '.meta' + saver_lib.import_meta_graph(meta_filename) + saver = saver_lib.Saver() + with self.test_session(graph=g) as sess: + saver.restore(sess, ckpt_path) + return sess.run(ops.get_collection('my_vars')) + + def _build_iterator_saver_hook(self, est): + return iterator_ops.CheckpointInputPipelineHook(est) + + def testReturnDatasetFromInputFn(self): + + def _input_fn(): + return 
dataset_ops.Dataset.range(10) + + est = estimator.Estimator(model_fn=self._model_fn) + + est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)]) + self.assertSequenceEqual(self._read_vars(est.model_dir), (2, 1)) + est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)]) + self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3)) + + def testBuildIteratorInInputFn(self): + + def _input_fn(): + ds = dataset_ops.Dataset.range(10) + iterator = ds.make_one_shot_iterator() + return iterator.get_next() + + est = estimator.Estimator(model_fn=self._model_fn) + + est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)]) + self.assertSequenceEqual(self._read_vars(est.model_dir), (2, 1)) + est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)]) + self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3)) + + def testDoNotRestore(self): + + def _input_fn(): + return dataset_ops.Dataset.range(10) + + est = estimator.Estimator(model_fn=self._model_fn) + + est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)]) + self.assertSequenceEqual(self._read_vars(est.model_dir), (2, 1)) + est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)]) + self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3)) + # Hook not provided, input pipeline was not restored. + est.train(_input_fn, steps=2) + self.assertSequenceEqual(self._read_vars(est.model_dir), (6, 1)) + + def testRaiseErrorIfNoIterator(self): + + def _input_fn(): + return constant_op.constant(1, dtype=dtypes.int64) + + est = estimator.Estimator(model_fn=self._model_fn) + + with self.assertRaises(ValueError): + est.train( + _input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)]) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/data/python/ops/optimization.py b/tensorflow/contrib/data/python/ops/optimization.py new file mode 100644 index 00000000000000..cad41bce2961f2 --- /dev/null +++ b/tensorflow/contrib/data/python/ops/optimization.py @@ -0,0 +1,80 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Experimental API for optimizing `tf.data` pipelines.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.data.python.ops import contrib_op_loader # pylint: disable=unused-import +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.util import nest +from tensorflow.python.data.util import sparse +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import gen_dataset_ops + + +def optimize(optimizations=None): + """A transformation that applies optimizations. + + Args: + optimizations: (Optional.) A `tf.string` vector `tf.Tensor` identifying + optimizations to use. 
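A minimal sketch of applying the new `optimize` transformation, mirroring the `OptimizeDatasetTest` cases earlier in this patch (the optimization name `"map_and_batch_fusion"` comes from those tests):

```python
import tensorflow as tf
from tensorflow.contrib.data.python.ops import optimization

dataset = (tf.data.Dataset.range(10)
           .map(lambda x: x * x)
           .batch(10)
           .apply(optimization.optimize(["map_and_batch_fusion"])))

# With no argument the default set of optimizations is applied, and an
# explicit empty list applies none:
#   dataset.apply(optimization.optimize())
#   dataset.apply(optimization.optimize([]))
```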
If not specified, the default set of optimizations + is applied. + + Returns: + A `Dataset` transformation function, which can be passed to + @{tf.data.Dataset.apply}. + """ + + def _apply_fn(dataset): + """Function from `Dataset` to `Dataset` that applies the transformation.""" + return OptimizeDataset(dataset, optimizations) + + return _apply_fn + + +class OptimizeDataset(dataset_ops.Dataset): + """A `Dataset` that acts as an identity, and applies optimizations.""" + + def __init__(self, input_dataset, optimizations): + """See `optimize()` for details.""" + super(OptimizeDataset, self).__init__() + self._input_dataset = input_dataset + if optimizations is None: + optimizations = [] + self._optimizations = ops.convert_to_tensor( + optimizations, dtype=dtypes.string, name="optimizations") + + def _as_variant_tensor(self): + return gen_dataset_ops.optimize_dataset( + self._input_dataset._as_variant_tensor(), # pylint: disable=protected-access + self._optimizations, + output_shapes=nest.flatten( + sparse.as_dense_shapes(self.output_shapes, self.output_classes)), + output_types=nest.flatten( + sparse.as_dense_types(self.output_types, self.output_classes))) + + @property + def output_classes(self): + return self._input_dataset.output_classes + + @property + def output_shapes(self): + return self._input_dataset.output_shapes + + @property + def output_types(self): + return self._input_dataset.output_types diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py index bbb808fbd77300..f938153f5f8c8b 100644 --- a/tensorflow/contrib/data/python/ops/readers.py +++ b/tensorflow/contrib/data/python/ops/readers.py @@ -17,16 +17,18 @@ from __future__ import division from __future__ import print_function +import collections import csv -from math import ceil import numpy as np from tensorflow.contrib.data.python.ops import batching +from tensorflow.contrib.data.python.ops import gen_dataset_ops as contrib_gen_dataset_ops from tensorflow.contrib.data.python.ops import interleave_ops from tensorflow.contrib.data.python.ops import shuffle_ops from tensorflow.python.data.ops import dataset_ops from tensorflow.python.data.ops import readers as core_readers +from tensorflow.python.data.util import convert from tensorflow.python.data.util import nest from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes @@ -34,9 +36,7 @@ from tensorflow.python.framework import tensor_shape from tensorflow.python.lib.io import file_io from tensorflow.python.ops import gen_dataset_ops -from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops -from tensorflow.python.ops import string_ops from tensorflow.python.platform import gfile from tensorflow.python.util import deprecation @@ -68,7 +68,7 @@ def _is_valid_float(str_val, float_dtype): return False -def _infer_type(str_val, na_value, prev_type, float_dtype): +def _infer_type(str_val, na_value, prev_type): """Given a string, infers its tensor type. Infers the type of a value by picking the least 'permissive' type possible, @@ -79,29 +79,34 @@ def _infer_type(str_val, na_value, prev_type, float_dtype): na_value: Additional string to recognize as a NA/NaN CSV value. prev_type: Type previously inferred based on values of this column that we've seen up till now. - float_dtype: Either `tf.float32` or `tf.float64`. Denotes what float type - to parse float strings as. Returns: Inferred dtype. 
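To make the widening ladder concrete, the expected inferences for a few sample field values, consistent with the CSV type-inference tests earlier in this patch:

```python
# Least-permissive dtype that can hold each string, widening only as needed:
#   "1"           -> tf.int32
#   "2147483649"  -> tf.int64    # 2**31 + 1 does not fit in int32
#   "1.0"         -> tf.float32
#   "4e40"        -> tf.float64  # overflows float32
#   "abc"         -> tf.string
#   "" or the na_value string    -> keeps the previously inferred type
#
# Inference is monotone: once a column has been widened (say, to float32),
# later narrower values such as "3" keep the wider type.
```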
""" if str_val in ("", na_value): + # If the field is null, it gives no extra information about its type return prev_type - if _is_valid_int32(str_val) and prev_type in (None, dtypes.int32): - return dtypes.int32 + type_list = [ + dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64, dtypes.string + ] # list of types to try, ordered from least permissive to most - if _is_valid_int64(str_val) and prev_type in (None, dtypes.int32, - dtypes.int64): - return dtypes.int64 + type_functions = [ + _is_valid_int32, + _is_valid_int64, + lambda str_val: _is_valid_float(str_val, dtypes.float32), + lambda str_val: _is_valid_float(str_val, dtypes.float64), + lambda str_val: True, + ] # Corresponding list of validation functions - if _is_valid_float(str_val, float_dtype) and prev_type != dtypes.string: - return float_dtype + for i in range(len(type_list)): + validation_fn = type_functions[i] + if validation_fn(str_val) and (prev_type is None or + prev_type in type_list[:i + 1]): + return type_list[i] - return dtypes.string - -def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header, - comment): +def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header): + """Generator that yields rows of CSV file(s) in order.""" for fn in filenames: with file_io.FileIO(fn, "r") as f: rdr = csv.reader( @@ -112,9 +117,6 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header, next(rdr) # Skip header lines for csv_row in rdr: - if comment is not None and csv_row[0].startswith(comment): - continue # Skip comment lines - if len(csv_row) != num_cols: raise ValueError( "Problem inferring types: CSV row has different number of fields " @@ -123,22 +125,21 @@ def _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header, def _infer_column_defaults(filenames, num_cols, field_delim, use_quote_delim, - na_value, header, comment, float_dtype, - num_rows_for_inference, select_columns): + na_value, header, num_rows_for_inference, + select_columns): """Infers column types from the first N valid CSV records of files.""" if select_columns is None: select_columns = range(num_cols) inferred_types = [None] * len(select_columns) for i, csv_row in enumerate( - _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header, - comment)): + _next_csv_row(filenames, num_cols, field_delim, use_quote_delim, header)): if num_rows_for_inference is not None and i >= num_rows_for_inference: break for j, col_index in enumerate(select_columns): inferred_types[j] = _infer_type(csv_row[col_index], na_value, - inferred_types[j], float_dtype) + inferred_types[j]) # Replace None's with a default type inferred_types = [t or dtypes.string for t in inferred_types] @@ -198,6 +199,112 @@ def _get_sorted_col_indices(select_columns, column_names): return result +def _maybe_shuffle_and_repeat( + dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed): + """Optionally shuffle and repeat dataset, as requested.""" + if num_epochs != 1 and shuffle: + # Use shuffle_and_repeat for perf + return dataset.apply( + shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs, + shuffle_seed)) + elif shuffle: + return dataset.shuffle(shuffle_buffer_size, shuffle_seed) + elif num_epochs != 1: + return dataset.repeat(num_epochs) + return dataset + + +def make_tf_record_dataset( + file_pattern, + batch_size, + parser_fn=None, + num_epochs=None, + shuffle=True, + shuffle_buffer_size=None, + shuffle_seed=None, + prefetch_buffer_size=None, + num_parallel_reads=None, + 
num_parallel_parser_calls=None, + drop_final_batch=False): + """Reads and optionally parses TFRecord files into a dataset. + + Provides common functionality such as batching, optional parsing, shuffling, + and performant defaults. + + Args: + file_pattern: List of files or patterns of TFRecord file paths. + See @{tf.gfile.Glob} for pattern rules. + batch_size: An int representing the number of records to combine + in a single batch. + parser_fn: (Optional.) A function accepting string input to parse + and process the record contents. This function must map records + to components of a fixed shape, so they may be batched. By + default, uses the record contents unmodified. + num_epochs: (Optional.) An int specifying the number of times this + dataset is repeated. If None (the default), cycles through the + dataset forever. + shuffle: (Optional.) A bool that indicates whether the input + should be shuffled. Defaults to `True`. + shuffle_buffer_size: (Optional.) Buffer size to use for + shuffling. A large buffer size ensures better shuffling, but + increases memory usage and startup time. + shuffle_seed: (Optional.) Randomization seed to use for shuffling. + prefetch_buffer_size: (Optional.) An int specifying the number of + feature batches to prefetch for performance improvement. + Defaults to auto-tune. Set to 0 to disable prefetching. + num_parallel_reads: (Optional.) Number of threads used to read + records from files. By default or if set to a value >1, the + results will be interleaved. + num_parallel_parser_calls: (Optional.) Number of parallel + records to parse in parallel. Defaults to an automatic selection. + drop_final_batch: (Optional.) Whether the last batch should be + dropped in case its size is smaller than `batch_size`; the + default behavior is not to drop the smaller batch. + + Returns: + A dataset, where each element matches the output of `parser_fn` + except it will have an additional leading `batch-size` dimension, + or a `batch_size`-length 1-D tensor of strings if `parser_fn` is + unspecified. + """ + files = dataset_ops.Dataset.list_files( + file_pattern, shuffle=shuffle, seed=shuffle_seed) + + if num_parallel_reads is None: + # Note: We considered auto-tuning this value, but there is a concern + # that this affects the mixing of records from different files, which + # could affect training convergence/accuracy, so we are defaulting to + # a constant for now. + num_parallel_reads = 24 + dataset = core_readers.TFRecordDataset( + files, num_parallel_reads=num_parallel_reads) + + if shuffle_buffer_size is None: + # TODO(josh11b): Auto-tune this value when not specified + shuffle_buffer_size = 10000 + dataset = _maybe_shuffle_and_repeat( + dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed) + + if parser_fn is None: + if drop_final_batch: + dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size)) + else: + dataset = dataset.batch(batch_size) + else: + # TODO(josh11b): if num_parallel_parser_calls is None, use some function + # of num cores instead of map_and_batch's default behavior of one batch. 
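Reading `_maybe_shuffle_and_repeat` back as plain `tf.data` calls, the three non-trivial cases expand as below (a sketch on a toy dataset; `shuffle_ops` is the contrib module already imported by this file).

```python
import tensorflow as tf
from tensorflow.contrib.data.python.ops import shuffle_ops

ds = tf.data.Dataset.range(10)
num_epochs, buffer_size, seed = 3, 10000, 42

# Shuffle and repeat requested together: use the fused op for performance.
fused = ds.apply(shuffle_ops.shuffle_and_repeat(buffer_size, num_epochs, seed))

# Only shuffle requested (num_epochs == 1).
shuffled = ds.shuffle(buffer_size, seed)

# Only repeat requested (shuffle=False).
repeated = ds.repeat(num_epochs)
```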
+ dataset = dataset.apply(batching.map_and_batch( + parser_fn, batch_size, num_parallel_calls=num_parallel_parser_calls, + drop_remainder=drop_final_batch)) + + if prefetch_buffer_size is None: + prefetch_buffer_size = -1 # tf.config.data.AUTOTUNE + if prefetch_buffer_size == 0: + return dataset + else: + return dataset.prefetch(buffer_size=prefetch_buffer_size) + + def make_csv_dataset( file_pattern, batch_size, @@ -209,7 +316,6 @@ def make_csv_dataset( use_quote_delim=True, na_value="", header=True, - comment=None, num_epochs=None, shuffle=True, shuffle_buffer_size=10000, @@ -218,7 +324,6 @@ def make_csv_dataset( num_parallel_reads=1, num_parallel_parser_calls=2, sloppy=False, - default_float_type=dtypes.float32, num_rows_for_inference=100, ): """Reads CSV files into a dataset. @@ -231,8 +336,8 @@ def make_csv_dataset( Args: file_pattern: List of files or patterns of file paths containing CSV records. See @{tf.gfile.Glob} for pattern rules. - batch_size: An int representing the number of consecutive elements of this - dataset to combine in a single batch. + batch_size: An int representing the number of records to combine + in a single batch. column_names: An optional list of strings that corresponds to the CSV columns, in order. One per column of the input record. If this is not provided, infers the column names from the first row of the records. @@ -272,15 +377,11 @@ def make_csv_dataset( header: A bool that indicates whether the first rows of provided CSV files correspond to header lines with column names, and should not be included in the data. - comment: An optional character string that marks lines that should not be - parsed as csv records. If this is provided, all lines that start with - this character will not be parsed. num_epochs: An int specifying the number of times this dataset is repeated. If None, cycles through the dataset forever. shuffle: A bool that indicates whether the input should be shuffled. shuffle_buffer_size: Buffer size to use for shuffling. A large buffer size - ensures better shuffling, but would increase memory usage and startup - time. + ensures better shuffling, but increases memory usage and startup time. shuffle_seed: Randomization seed to use for shuffling. prefetch_buffer_size: An int specifying the number of feature batches to prefetch for performance improvement. Recommended value is the number of @@ -294,8 +395,6 @@ def make_csv_dataset( produced is deterministic prior to shuffling (elements are still randomized if `shuffle=True`. Note that if the seed is set, then order of elements after shuffling is deterministic). Defaults to `False`. - default_float_type: Either `tf.float32` or `tf.float64`. If defaults are - not provided, float-like strings are interpreted to be this type. num_rows_for_inference: Number of rows of a file to use for type inference if record_defaults is not provided. If None, reads all the rows of all the files. Defaults to 100. 
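A minimal end-to-end sketch of `make_tf_record_dataset` with a parser. The file pattern and feature spec are hypothetical; the only requirement from the docstring is that `parser_fn` return fixed-shape components so records can be batched.

```python
import tensorflow as tf
from tensorflow.contrib.data.python.ops import readers

# Hypothetical feature spec; adjust to the actual TFRecord contents.
feature_spec = {
    "label": tf.FixedLenFeature([], tf.int64),
    "image_raw": tf.FixedLenFeature([], tf.string),
}

def parse_fn(serialized):
  return tf.parse_single_example(serialized, feature_spec)

dataset = readers.make_tf_record_dataset(
    file_pattern="/tmp/data/train-*.tfrecord",  # assumed path
    batch_size=32,
    parser_fn=parse_fn,
    num_epochs=1,
    shuffle=True)

features = dataset.make_one_shot_iterator().get_next()
```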
@@ -317,8 +416,6 @@ def make_csv_dataset( dataset = dataset.shuffle(len(filenames), shuffle_seed) # Clean arguments; figure out column names and defaults - if comment is not None and len(comment) != 1: - raise ValueError("`comment` arg must be a single-character string or None") if column_names is None: if not header: @@ -341,8 +438,7 @@ def make_csv_dataset( # construction time column_defaults = _infer_column_defaults( filenames, len(column_names), field_delim, use_quote_delim, na_value, - header, comment, default_float_type, num_rows_for_inference, - select_columns) + header, num_rows_for_inference, select_columns) if select_columns is not None and len(column_defaults) != len(select_columns): raise ValueError( @@ -356,71 +452,189 @@ def make_csv_dataset( if label_name is not None and label_name not in column_names: raise ValueError("`label_name` provided must be one of the columns.") - # Define map and filter functions - def filter_fn(line): - return math_ops.not_equal(string_ops.substr(line, 0, 1), comment) - def filename_to_dataset(filename): - ds = core_readers.TextLineDataset(filename) - if header: - ds = ds.skip(1) - if comment is not None: - ds = ds.filter(filter_fn) - return ds + return CsvDataset( + filename, + record_defaults=column_defaults, + field_delim=field_delim, + use_quote_delim=use_quote_delim, + na_value=na_value, + select_cols=select_columns, + header=header) - def decode_csv(line): - """Decodes CSV line into features. + def map_fn(*columns): + """Organizes columns into a features dictionary. Args: - line: String tensor corresponding to one csv record. + *columns: list of `Tensor`s corresponding to one csv record. Returns: - A dictionary of feature names to values for that particular record. If + An OrderedDict of feature names to values for that particular record. If label_name is provided, extracts the label feature to be returned as the second element of the tuple. 
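A small usage sketch of the reworked `make_csv_dataset`; the file pattern and the `"target"` label column are placeholders, and the files are assumed to have a header row so column names can be inferred.

```python
import tensorflow as tf
from tensorflow.contrib.data.python.ops import readers

dataset = readers.make_csv_dataset(
    file_pattern="train-*.csv",   # assumed CSV files with a header row
    batch_size=128,
    label_name="target",          # assumed label column name
    num_epochs=1,
    shuffle=True)

features, label = dataset.make_one_shot_iterator().get_next()
# `features` is an OrderedDict mapping column names to batched tensors.
```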
""" - columns = parsing_ops.decode_csv( - line, - column_defaults, - field_delim=field_delim, - use_quote_delim=use_quote_delim, - na_value=na_value, - select_cols=select_columns, - ) - features = dict(zip(column_names, columns)) + features = collections.OrderedDict(zip(column_names, columns)) if label_name is not None: label = features.pop(label_name) return features, label return features - # Read files sequentially or in parallel + # Read files sequentially (if num_parallel_reads=1) or in parallel dataset = dataset.apply( interleave_ops.parallel_interleave( filename_to_dataset, cycle_length=num_parallel_reads, sloppy=sloppy)) - if num_epochs != 1 and shuffle: - # Use shuffle_and_repeat for perf - dataset = dataset.apply( - shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs, - shuffle_seed)) - elif shuffle: - dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed) - elif num_epochs != 1: - dataset = dataset.repeat(num_epochs) - - # Use map_and_batch for perf - # TODO(b/76425672): use num_parallel_calls for better performance tuning when - # that is added - dataset = dataset.apply( - batching.map_and_batch( - map_func=decode_csv, - batch_size=batch_size, - num_parallel_batches=int( - ceil(num_parallel_parser_calls / batch_size)))) + dataset = _maybe_shuffle_and_repeat( + dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed) + # Apply batch before map for perf, because map has high overhead relative + # to the size of the computation in each map + dataset = dataset.batch(batch_size=batch_size) + dataset = dataset.map(map_fn, num_parallel_calls=num_parallel_parser_calls) dataset = dataset.prefetch(prefetch_buffer_size) + return dataset +_DEFAULT_READER_BUFFER_SIZE_BYTES = 4 * 1024 * 1024 # 4 MB + + +class CsvDataset(dataset_ops.Dataset): + """A Dataset comprising lines from one or more CSV files.""" + + def __init__(self, + filenames, + record_defaults, + buffer_size=None, + header=False, + field_delim=",", + use_quote_delim=True, + na_value="", + select_cols=None): + """Creates a `CsvDataset` by reading and decoding CSV files. + + The elements of this dataset correspond to records from the file(s). + RFC 4180 format is expected for CSV files + (https://tools.ietf.org/html/rfc4180) + Note that we allow leading and trailing spaces with int or float field. + + + For example, suppose we have a file 'my_file0.csv' with four CSV columns of + different data types: + ``` + abcdefg,4.28E10,5.55E6,12 + hijklmn,-5.3E14,,2 + ``` + + We can construct a CsvDataset from it as follows: + ```python + dataset = tf.contrib.data.CsvDataset( + "my_file*.csv", + [tf.float32, # Required field, use dtype or empty tensor + tf.constant([0.0], dtype=tf.float32), # Optional field, default to 0.0 + tf.int32, # Required field, use dtype or empty tensor + ], + select_cols=[1,2,3] # Only parse last three columns + ) + ``` + + The expected output of its iterations is: + ```python + next = dataset.make_one_shot_iterator().get_next() + with tf.Session() as sess: + while True: + try: + print(sess.run(nxt)) + except tf.errors.OutOfRangeError: + break + + >> (4.28e10, 5.55e6, 12) + >> (-5.3e14, 0.0, 2) + ``` + + Args: + filenames: A `tf.string` tensor containing one or more filenames. + record_defaults: A list of default values for the CSV fields. Each item in + the list is either a valid CSV `DType` (float32, float64, int32, int64, + string), or a `Tensor` object with one of the above types. 
One per + column of CSV data, with either a scalar `Tensor` default value for the + column if it is optional, or `DType` or empty `Tensor` if required. If + both this and `select_columns` are specified, these must have the same + lengths, and `column_defaults` is assumed to be sorted in order of + increasing column index. + buffer_size: (Optional.) A `tf.int64` scalar denoting the number of bytes + to buffer while reading files. Defaults to 4MB. + header: (Optional.) A `tf.bool` scalar indicating whether the CSV file(s) + have header line(s) that should be skipped when parsing. Defaults to + `False`. + field_delim: (Optional.) A `tf.string` scalar containing the delimiter + character that separates fields in a record. Defaults to `","`. + use_quote_delim: (Optional.) A `tf.bool` scalar. If `False`, treats + double quotation marks as regular characters inside of string fields + (ignoring RFC 4180, Section 2, Bullet 5). Defaults to `True`. + na_value: (Optional.) A `tf.string` scalar indicating a value that will + be treated as NA/NaN. + select_cols: (Optional.) A sorted list of column indices to select from + the input data. If specified, only this subset of columns will be + parsed. Defaults to parsing all columns. + """ + super(CsvDataset, self).__init__() + self._filenames = ops.convert_to_tensor( + filenames, dtype=dtypes.string, name="filenames") + record_defaults = [ + constant_op.constant([], dtype=x) if x in _ACCEPTABLE_CSV_TYPES else x + for x in record_defaults + ] + self._record_defaults = ops.convert_n_to_tensor( + record_defaults, name="record_defaults") + self._buffer_size = convert.optional_param_to_tensor( + "buffer_size", buffer_size, _DEFAULT_READER_BUFFER_SIZE_BYTES) + self._header = ops.convert_to_tensor( + header, dtype=dtypes.bool, name="header") + self._field_delim = ops.convert_to_tensor( + field_delim, dtype=dtypes.string, name="field_delim") + self._use_quote_delim = ops.convert_to_tensor( + use_quote_delim, dtype=dtypes.bool, name="use_quote_delim") + self._na_value = ops.convert_to_tensor( + na_value, dtype=dtypes.string, name="na_value") + self._select_cols = convert.optional_param_to_tensor( + "select_cols", + select_cols, + argument_default=[], + argument_dtype=dtypes.int64, + ) + self._output_shapes = tuple( + tensor_shape.scalar() for _ in range(len(record_defaults))) + self._output_types = tuple(d.dtype for d in self._record_defaults) + self._output_classes = tuple( + ops.Tensor for _ in range(len(record_defaults))) + + def _as_variant_tensor(self): + # Constructs graph node for the dataset op. + return contrib_gen_dataset_ops.csv_dataset( + filenames=self._filenames, + record_defaults=self._record_defaults, + buffer_size=self._buffer_size, + header=self._header, + output_shapes=self._output_shapes, + field_delim=self._field_delim, + use_quote_delim=self._use_quote_delim, + na_value=self._na_value, + select_cols=self._select_cols, + ) + + @property + def output_types(self): + return self._output_types + + @property + def output_shapes(self): + return self._output_shapes + + @property + def output_classes(self): + return self._output_classes + + def make_batched_features_dataset(file_pattern, batch_size, features, @@ -480,8 +694,8 @@ def make_batched_features_dataset(file_pattern, Args: file_pattern: List of files or patterns of file paths containing `Example` records. See `tf.gfile.Glob` for pattern rules. - batch_size: An int representing the number of consecutive elements of this - dataset to combine in a single batch. 
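The iteration snippet in the docstring above switches between `next` and `nxt`; here is the same example as a self-contained script with consistent naming, assuming the class is exported as `tf.contrib.data.CsvDataset` as the docstring indicates.

```python
import tensorflow as tf

# Same sample file and defaults as in the CsvDataset docstring above.
dataset = tf.contrib.data.CsvDataset(
    "my_file*.csv",
    [tf.float32,                             # required float column
     tf.constant([0.0], dtype=tf.float32),   # optional column, defaults to 0.0
     tf.int32],                              # required int column
    select_cols=[1, 2, 3])

next_element = dataset.make_one_shot_iterator().get_next()
with tf.Session() as sess:
  while True:
    try:
      print(sess.run(next_element))
    except tf.errors.OutOfRangeError:
      break
```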
+ batch_size: An int representing the number of records to combine + in a single batch. features: A `dict` mapping feature keys to `FixedLenFeature` or `VarLenFeature` values. See `tf.parse_example`. reader: A function or class that can be @@ -537,16 +751,8 @@ def make_batched_features_dataset(file_pattern, dataset = dataset.map(lambda _, v: v) # Apply dataset repeat and shuffle transformations. - repeat_dataset = (num_epochs != 1) - if repeat_dataset and shuffle: - # Used fused shuffle_and_repeat operation for better performance - dataset = dataset.apply( - shuffle_ops.shuffle_and_repeat(shuffle_buffer_size, num_epochs, - shuffle_seed)) - elif repeat_dataset: - dataset = dataset.repeat(num_epochs) - elif shuffle: - dataset = dataset.shuffle(shuffle_buffer_size, shuffle_seed) + dataset = _maybe_shuffle_and_repeat( + dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed) if drop_final_batch: dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size)) @@ -620,8 +826,8 @@ def read_batch_features(file_pattern, Args: file_pattern: List of files or patterns of file paths containing `Example` records. See `tf.gfile.Glob` for pattern rules. - batch_size: An int representing the number of consecutive elements of this - dataset to combine in a single batch. + batch_size: An int representing the number of records to combine + in a single batch. features: A `dict` mapping feature keys to `FixedLenFeature` or `VarLenFeature` values. See `tf.parse_example`. reader: A function or class that can be diff --git a/tensorflow/contrib/data/python/ops/resampling.py b/tensorflow/contrib/data/python/ops/resampling.py index a182dddd38d23d..bad6edd5147d83 100644 --- a/tensorflow/contrib/data/python/ops/resampling.py +++ b/tensorflow/contrib/data/python/ops/resampling.py @@ -20,10 +20,12 @@ import numpy as np from tensorflow.contrib.data.python.ops import batching +from tensorflow.contrib.data.python.ops import interleave_ops from tensorflow.contrib.data.python.ops import scan_ops from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import logging_ops @@ -50,79 +52,182 @@ def rejection_resample(class_func, target_dist, initial_dist=None, seed=None): A `Dataset` transformation function, which can be passed to @{tf.data.Dataset.apply}. """ - def _apply_fn(dataset): """Function from `Dataset` to `Dataset` that applies the transformation.""" - dist_estimation_batch_size = 32 target_dist_t = ops.convert_to_tensor(target_dist, name="target_dist") class_values_ds = dataset.map(class_func) + + # Get initial distribution. 
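Because the control flow that follows is fairly dense, a short usage sketch of `rejection_resample` on a toy, imbalanced dataset; the class counts and distributions are made up for illustration.

```python
import tensorflow as tf
from tensorflow.contrib.data.python.ops import resampling

# A toy dataset of (class, value) pairs: class 0 is 99x more common.
classes = [0] * 99 + [1]
values = list(range(100))
dataset = tf.data.Dataset.from_tensor_slices((classes, values))

resampled = dataset.apply(resampling.rejection_resample(
    class_func=lambda c, v: c,
    target_dist=[0.5, 0.5],
    initial_dist=[0.99, 0.01],   # optional; estimated online when omitted
    seed=42))
# Each element of `resampled` is a (class, original_element) pair.
```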
if initial_dist is not None: initial_dist_t = ops.convert_to_tensor(initial_dist, name="initial_dist") - acceptance_dist = _calculate_acceptance_probs(initial_dist_t, - target_dist_t) + acceptance_dist, prob_of_original = ( + _calculate_acceptance_probs_with_mixing(initial_dist_t, + target_dist_t)) initial_dist_ds = dataset_ops.Dataset.from_tensors( initial_dist_t).repeat() acceptance_dist_ds = dataset_ops.Dataset.from_tensors( acceptance_dist).repeat() + prob_of_original_ds = dataset_ops.Dataset.from_tensors( + prob_of_original).repeat() + else: + initial_dist_ds = _estimate_initial_dist_ds( + target_dist_t, class_values_ds) + acceptance_and_original_prob_ds = initial_dist_ds.map( + lambda initial: _calculate_acceptance_probs_with_mixing( + initial, target_dist_t)) + acceptance_dist_ds = acceptance_and_original_prob_ds.map( + lambda accept_prob, _: accept_prob) + prob_of_original_ds = acceptance_and_original_prob_ds.map( + lambda _, prob_original: prob_original) + filtered_ds = _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, + class_values_ds, seed) + # Prefetch filtered dataset for speed. + filtered_ds = filtered_ds.prefetch(3) + + prob_original_static = _get_prob_original_static( + initial_dist_t, target_dist_t) if initial_dist is not None else None + if prob_original_static == 1: + return dataset_ops.Dataset.zip((class_values_ds, dataset)) + elif prob_original_static == 0: + return filtered_ds else: - num_classes = (target_dist_t.shape[0].value or - array_ops.shape(target_dist_t)[0]) - smoothing_constant = 10 - initial_examples_per_class_seen = array_ops.fill( - [num_classes], np.int64(smoothing_constant)) - - def update_estimate_and_tile(num_examples_per_class_seen, c): - updated_examples_per_class_seen, dist = _estimate_data_distribution( - c, num_examples_per_class_seen) - tiled_dist = array_ops.tile( - array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1]) - return updated_examples_per_class_seen, tiled_dist - - initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size) - .apply(scan_ops.scan(initial_examples_per_class_seen, - update_estimate_and_tile)) - .apply(batching.unbatch())) - acceptance_dist_ds = initial_dist_ds.map( - lambda initial: _calculate_acceptance_probs(initial, target_dist_t)) - - def maybe_warn_on_large_rejection(accept_dist, initial_dist): - proportion_rejected = math_ops.reduce_sum( - (1 - accept_dist) * initial_dist) - return control_flow_ops.cond( - math_ops.less(proportion_rejected, .5), - lambda: accept_dist, - lambda: logging_ops.Print( # pylint: disable=g-long-lambda - accept_dist, [proportion_rejected, initial_dist, accept_dist], - message="Proportion of examples rejected by sampler is high: ", - summarize=100, - first_n=10)) - - acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds, - initial_dist_ds)) - .map(maybe_warn_on_large_rejection)) - - def _gather_and_copy(class_val, acceptance_prob, data): - return (class_val, array_ops.gather(acceptance_prob, class_val), data) - current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip( - (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy) - filtered_ds = ( - current_probabilities_and_class_and_data_ds - .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p)) - return filtered_ds.map(lambda class_value, _, data: (class_value, data)) + return interleave_ops.sample_from_datasets( + [dataset_ops.Dataset.zip((class_values_ds, dataset)), filtered_ds], + weights=prob_of_original_ds.map(lambda prob: [(prob, 1.0 - prob)]), + 
seed=seed) return _apply_fn -def _calculate_acceptance_probs(initial_probs, target_probs): - """Calculate the per-class acceptance rates. +def _get_prob_original_static(initial_dist_t, target_dist_t): + """Returns the static probability of sampling from the original. + + `tensor_util.constant_value(prob_of_original)` returns `None` if it encounters + an Op that it isn't defined for. We have some custom logic to avoid this. + + Args: + initial_dist_t: A tensor of the initial distribution. + target_dist_t: A tensor of the target distribution. + + Returns: + The probability of sampling from the original distribution as a constant, + if it is a constant, or `None`. + """ + init_static = tensor_util.constant_value(initial_dist_t) + target_static = tensor_util.constant_value(target_dist_t) + + if init_static is None or target_static is None: + return None + else: + return np.min(target_static / init_static) + + +def _filter_ds(dataset, acceptance_dist_ds, initial_dist_ds, class_values_ds, + seed): + """Filters a dataset based on per-class acceptance probabilities. Args: - initial_probs: The class probabilities of the data. - target_probs: The desired class proportion in minibatches. + dataset: The dataset to be filtered. + acceptance_dist_ds: A dataset of acceptance probabilities. + initial_dist_ds: A dataset of the initial probability distribution, given or + estimated. + class_values_ds: A dataset of the corresponding classes. + seed: (Optional.) Python integer seed for the resampler. + Returns: - A list of the per-class acceptance probabilities. + A dataset of (class value, data) after filtering. + """ + def maybe_warn_on_large_rejection(accept_dist, initial_dist): + proportion_rejected = math_ops.reduce_sum((1 - accept_dist) * initial_dist) + return control_flow_ops.cond( + math_ops.less(proportion_rejected, .5), + lambda: accept_dist, + lambda: logging_ops.Print( # pylint: disable=g-long-lambda + accept_dist, [proportion_rejected, initial_dist, accept_dist], + message="Proportion of examples rejected by sampler is high: ", + summarize=100, + first_n=10)) + + acceptance_dist_ds = (dataset_ops.Dataset.zip((acceptance_dist_ds, + initial_dist_ds)) + .map(maybe_warn_on_large_rejection)) + + def _gather_and_copy(class_val, acceptance_prob, data): + return class_val, array_ops.gather(acceptance_prob, class_val), data + + current_probabilities_and_class_and_data_ds = dataset_ops.Dataset.zip( + (class_values_ds, acceptance_dist_ds, dataset)).map(_gather_and_copy) + filtered_ds = ( + current_probabilities_and_class_and_data_ds + .filter(lambda _1, p, _2: random_ops.random_uniform([], seed=seed) < p)) + return filtered_ds.map(lambda class_value, _, data: (class_value, data)) + + +def _estimate_initial_dist_ds( + target_dist_t, class_values_ds, dist_estimation_batch_size=32, + smoothing_constant=10): + num_classes = (target_dist_t.shape[0].value or + array_ops.shape(target_dist_t)[0]) + initial_examples_per_class_seen = array_ops.fill( + [num_classes], np.int64(smoothing_constant)) + + def update_estimate_and_tile(num_examples_per_class_seen, c): + updated_examples_per_class_seen, dist = _estimate_data_distribution( + c, num_examples_per_class_seen) + tiled_dist = array_ops.tile( + array_ops.expand_dims(dist, 0), [dist_estimation_batch_size, 1]) + return updated_examples_per_class_seen, tiled_dist - This method is based on solving the following analysis: + initial_dist_ds = (class_values_ds.batch(dist_estimation_batch_size) + .apply(scan_ops.scan(initial_examples_per_class_seen, + 
update_estimate_and_tile)) + .apply(batching.unbatch())) + + return initial_dist_ds + + +def _get_target_to_initial_ratio(initial_probs, target_probs): + # Add tiny to initial_probs to avoid divide by zero. + denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny) + return target_probs / denom + + +def _estimate_data_distribution(c, num_examples_per_class_seen): + """Estimate data distribution as labels are seen. + + Args: + c: The class labels. Type `int32`, shape `[batch_size]`. + num_examples_per_class_seen: Type `int64`, shape `[num_classes]`, + containing counts. + + Returns: + num_examples_per_lass_seen: Updated counts. Type `int64`, shape + `[num_classes]`. + dist: The updated distribution. Type `float32`, shape `[num_classes]`. + """ + num_classes = num_examples_per_class_seen.get_shape()[0].value + # Update the class-count based on what labels are seen in batch. + num_examples_per_class_seen = math_ops.add( + num_examples_per_class_seen, math_ops.reduce_sum( + array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0)) + init_prob_estimate = math_ops.truediv( + num_examples_per_class_seen, + math_ops.reduce_sum(num_examples_per_class_seen)) + dist = math_ops.cast(init_prob_estimate, dtypes.float32) + return num_examples_per_class_seen, dist + + +def _calculate_acceptance_probs_with_mixing(initial_probs, target_probs): + """Calculates the acceptance probabilities and mixing ratio. + + In this case, we assume that we can *either* sample from the original data + distribution with probability `m`, or sample from a reshaped distribution + that comes from rejection sampling on the original distribution. This + rejection sampling is done on a per-class basis, with `a_i` representing the + probability of accepting data from class `i`. + + This method is based on solving the following analysis for the reshaped + distribution: Let F be the probability of a rejection (on any example). Let p_i be the proportion of examples in the data in class i (init_probs) @@ -151,39 +256,39 @@ def _calculate_acceptance_probs(initial_probs, target_probs): 0 <= t_i <= 1, sum_i(t_i) = 1 ``` - A solution for a_i in terms of the other variables is the following: ```a_i = (t_i / p_i) / max_i[t_i / p_i]``` - """ - # Add tiny to initial_probs to avoid divide by zero. - denom = (initial_probs + np.finfo(initial_probs.dtype.as_numpy_dtype).tiny) - ratio_l = target_probs / denom - # Calculate list of acceptance probabilities. - max_ratio = math_ops.reduce_max(ratio_l) - return ratio_l / max_ratio + If we try to minimize the amount of data rejected, we get the following: + M_max = max_i [ t_i / p_i ] + M_min = min_i [ t_i / p_i ] -def _estimate_data_distribution(c, num_examples_per_class_seen): - """Estimate data distribution as labels are seen. + The desired probability of accepting data if it comes from class `i`: + + a_i = (t_i/p_i - m) / (M_max - m) + + The desired probability of pulling a data element from the original dataset, + rather than the filtered one: + + m = M_min Args: - c: The class labels. Type `int32`, shape `[batch_size]`. - num_examples_per_class_seen: Type `int64`, shape `[num_classes]`, - containing counts. + initial_probs: A Tensor of the initial probability distribution, given or + estimated. + target_probs: A Tensor of the corresponding classes. Returns: - num_examples_per_lass_seen: Updated counts. Type `int64`, shape - `[num_classes]`. - dist: The updated distribution. Type `float32`, shape `[num_classes]`. 
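A pure-numpy sketch of what `_estimate_data_distribution` computes: counts seeded with the smoothing constant and updated per batch of observed labels. This mirrors the logic only; it is not the TensorFlow ops.

```python
import numpy as np

num_classes, smoothing_constant = 2, 10
counts = np.full([num_classes], smoothing_constant, dtype=np.int64)

for batch in ([0, 0, 1, 0], [0, 1, 0, 0]):
  counts += np.bincount(batch, minlength=num_classes)
  dist = counts / counts.sum()
  print(dist)  # drifts toward the empirical class frequencies as data arrives
```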
+ (A 1D Tensor with the per-class acceptance probabilities, the desired + probability of pull from the original distribution.) """ - num_classes = num_examples_per_class_seen.get_shape()[0].value - # Update the class-count based on what labels are seen in batch. - num_examples_per_class_seen = math_ops.add( - num_examples_per_class_seen, math_ops.reduce_sum( - array_ops.one_hot(c, num_classes, dtype=dtypes.int64), 0)) - init_prob_estimate = math_ops.truediv( - num_examples_per_class_seen, - math_ops.reduce_sum(num_examples_per_class_seen)) - dist = math_ops.cast(init_prob_estimate, dtypes.float32) - return num_examples_per_class_seen, dist + ratio_l = _get_target_to_initial_ratio(initial_probs, target_probs) + max_ratio = math_ops.reduce_max(ratio_l) + min_ratio = math_ops.reduce_min(ratio_l) + + # Target prob to sample from original distribution. + m = min_ratio + + # TODO(joelshor): Simplify fraction, if possible. + a_i = (ratio_l - m) / (max_ratio - m) + return a_i, m \ No newline at end of file diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py index 60ef7efba4bb2b..e911ad0fa0541f 100644 --- a/tensorflow/contrib/data/python/ops/scan_ops.py +++ b/tensorflow/contrib/data/python/ops/scan_ops.py @@ -24,6 +24,7 @@ from tensorflow.python.data.util import sparse from tensorflow.python.framework import function from tensorflow.python.framework import ops +from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import gen_dataset_ops @@ -36,18 +37,22 @@ def __init__(self, input_dataset, initial_state, scan_func): self._input_dataset = input_dataset with ops.name_scope("initial_state"): + # Convert any `SparseTensorValue`s to `SparseTensor`s and all other + # values to tensors. self._initial_state = nest.pack_sequence_as(initial_state, [ - ops.convert_to_tensor(t, name="component_%d" % i) + sparse_tensor.SparseTensor.from_value(t) + if sparse_tensor.is_sparse(t) else ops.convert_to_tensor( + t, name="component_%d" % i) for i, t in enumerate(nest.flatten(initial_state)) ]) - # Compute initial values for the state shapes and types based on - # the initial state. These will be refined by running - # `tf_scan_func` one or more times below. - # TODO(b/68937811): Allow the initial state to be a tf.SparseTensor. + # Compute initial values for the state classes, shapes and types based on + # the initial state. The shapes may be refined by running `tf_scan_func` one + # or more times below. + self._state_classes = sparse.get_classes(self._initial_state) self._state_shapes = nest.pack_sequence_as( self._initial_state, - [t.shape for t in nest.flatten(self._initial_state)]) + [t.get_shape() for t in nest.flatten(self._initial_state)]) self._state_types = nest.pack_sequence_as( self._initial_state, [t.dtype for t in nest.flatten(self._initial_state)]) @@ -62,67 +67,102 @@ def __init__(self, input_dataset, initial_state, scan_func): need_to_rerun = True while need_to_rerun: - flat_state_shapes = nest.flatten(self._state_shapes) - flat_state_types = nest.flatten(self._state_types) - - # Create a list in which `tf_scan_func` will store the s + # Create a list in which `tf_scan_func` will store the new shapes. 
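A quick numeric check of the mixing formulas in `_calculate_acceptance_probs_with_mixing` above, `m = min_i(t_i / p_i)` and `a_i = (t_i / p_i - m) / (M_max - m)`, in plain numpy with a made-up two-class distribution.

```python
import numpy as np

init_probs = np.array([0.9, 0.1])     # p_i: observed class distribution
target_probs = np.array([0.5, 0.5])   # t_i: desired class distribution

ratio = target_probs / init_probs     # t_i / p_i
m = ratio.min()                       # probability of sampling the original
a = (ratio - m) / (ratio.max() - m)   # per-class acceptance probabilities

print(m)  # ~0.556: keep the original stream a bit over half the time
print(a)  # [0., 1.]: in the rejection-sampled branch, drop class 0 and
          # accept every class-1 example
```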
flat_new_state_shapes = [] - @function.Defun(*(flat_state_types + nest.flatten( - sparse.as_dense_types(input_dataset.output_types, - input_dataset.output_classes)))) + @function.Defun(*(nest.flatten( + sparse.as_dense_types( + self._state_types, self._state_classes)) + nest.flatten( + sparse.as_dense_types(input_dataset.output_types, + input_dataset.output_classes)))) def tf_scan_func(*args): """A wrapper for Defun that facilitates shape inference.""" # Pass in shape information from the state and input_dataset. - # TODO(b/69424092): Check that neither inputs nor outputs are sparse. - dense_shapes = sparse.as_dense_shapes(input_dataset.output_shapes, - input_dataset.output_classes) - for arg, shape in zip(args, - flat_state_shapes + nest.flatten(dense_shapes)): + for arg, shape in zip( + args, + nest.flatten( + sparse.as_dense_shapes(self._state_shapes, self._state_classes)) + + nest.flatten( + sparse.as_dense_shapes(input_dataset.output_shapes, + input_dataset.output_classes))): arg.set_shape(shape) - pivot = len(flat_state_shapes) - old_state = nest.pack_sequence_as(self._initial_state, args[:pivot]) - input_value = nest.pack_sequence_as(input_dataset.output_types, - args[pivot:]) - - ret = scan_func(old_state, input_value) + pivot = len(nest.flatten(self._state_shapes)) + print(self._state_classes) + nested_state_args = nest.pack_sequence_as(self._state_types, + args[:pivot]) + nested_state_args = sparse.deserialize_sparse_tensors( + nested_state_args, self._state_types, self._state_shapes, + self._state_classes) + print(input_dataset.output_classes) + nested_input_args = nest.pack_sequence_as(input_dataset.output_types, + args[pivot:]) + nested_input_args = sparse.deserialize_sparse_tensors( + nested_input_args, input_dataset.output_types, + input_dataset.output_shapes, input_dataset.output_classes) + + ret = scan_func(nested_state_args, nested_input_args) if not isinstance(ret, collections.Sequence) or len(ret) != 2: raise TypeError("The scan function must return a pair comprising the " "new state and the output value.") + + # Convert any `SparseTensorValue`s to `SparseTensor`s and all other + # values to tensors. + ret = nest.pack_sequence_as(ret, [ + sparse_tensor.SparseTensor.from_value(t) + if sparse_tensor.is_sparse(t) else ops.convert_to_tensor(t) + for t in nest.flatten(ret) + ]) new_state, output_value = ret - flat_new_state = [ - ops.convert_to_tensor(t) for t in nest.flatten(new_state) - ] - flat_output_value = [ - ops.convert_to_tensor(t) for t in nest.flatten(output_value) - ] + # Extract and validate class information from the returned values. + for t, clazz in zip( + nest.flatten(new_state), nest.flatten(self._state_classes)): + if not isinstance(t, clazz): + raise TypeError( + "The element classes for the new state must match the initial " + "state. Expected %s; got %s." % + (self._state_classes, + nest.pack_sequence_as( + self._state_types, + [type(t) for t in nest.flatten(new_state)]))) + self._output_classes = sparse.get_classes(output_value) # Extract shape information from the returned values. - flat_new_state_shapes.extend([t.shape for t in flat_new_state]) + flat_new_state_shapes.extend( + [t.get_shape() for t in nest.flatten(new_state)]) self._output_shapes = nest.pack_sequence_as( - output_value, [t.shape for t in flat_output_value]) + output_value, [t.get_shape() for t in nest.flatten(output_value)]) # Extract and validate type information from the returned values. 
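For context on what `tf_scan_func` wraps, a minimal usage sketch of the `scan` transformation computing a running sum with a dense state; the change above additionally allows the state to contain `SparseTensor`s.

```python
import tensorflow as tf
from tensorflow.contrib.data.python.ops import scan_ops

# scan_func maps (old_state, input) -> (new_state, output).
dataset = tf.data.Dataset.range(5).apply(
    scan_ops.scan(tf.constant(0, dtype=tf.int64),
                  lambda state, x: (state + x, state + x)))
# Produces the running sums 0, 1, 3, 6, 10.
```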
- for t, dtype in zip(flat_new_state, flat_state_types): + for t, dtype in zip( + nest.flatten(new_state), nest.flatten(self._state_types)): if t.dtype != dtype: raise TypeError( "The element types for the new state must match the initial " "state. Expected %s; got %s." % - (self._state_types, nest.pack_sequence_as( - self._state_types, [t.dtype for t in flat_new_state]))) - self._output_classes = nest.pack_sequence_as( - output_value, [ops.Tensor for _ in flat_output_value]) + (self._state_types, + nest.pack_sequence_as( + self._state_types, + [t.dtype for t in nest.flatten(new_state)]))) self._output_types = nest.pack_sequence_as( - output_value, [t.dtype for t in flat_output_value]) - - return flat_new_state + flat_output_value + output_value, [t.dtype for t in nest.flatten(output_value)]) + + # Serialize any sparse tensors. + new_state = nest.pack_sequence_as(new_state, [ + t for t in nest.flatten(sparse.serialize_sparse_tensors(new_state)) + ]) + output_value = nest.pack_sequence_as(output_value, [ + t for t in nest.flatten( + sparse.serialize_sparse_tensors(output_value)) + ]) + return nest.flatten(new_state) + nest.flatten(output_value) # Use the private method that will execute `tf_scan_func` but delay # adding it to the graph in case we need to rerun the function. tf_scan_func._create_definition_if_needed() # pylint: disable=protected-access + flat_state_shapes = nest.flatten(self._state_shapes) weakened_state_shapes = [ original.most_specific_compatible_shape(new) for original, new in zip(flat_state_shapes, flat_new_state_shapes) @@ -150,7 +190,7 @@ def _as_variant_tensor(self): input_t = self._input_dataset._as_variant_tensor() # pylint: disable=protected-access return gen_dataset_ops.scan_dataset( input_t, - nest.flatten(self._initial_state), + nest.flatten(sparse.serialize_sparse_tensors(self._initial_state)), self._scan_func.captured_inputs, f=self._scan_func, output_types=nest.flatten( diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py index d39172039683fe..3cbaab5affd739 100644 --- a/tensorflow/contrib/data/python/ops/stats_ops.py +++ b/tensorflow/contrib/data/python/ops/stats_ops.py @@ -136,8 +136,8 @@ def _apply_fn(dataset): def bytes_produced_stats(tag): """Records the number of bytes produced by each element of the input dataset. - To consume the statistics, associate a `StatsAggregator` with an iterator - over the output dataset. + To consume the statistics, associate a `StatsAggregator` with the output + dataset. Args: tag: String. All statistics recorded by the returned transformation will @@ -158,8 +158,8 @@ def _apply_fn(dataset): def latency_stats(tag): """Records the latency of producing each element of the input dataset. - To consume the statistics, associate a `StatsAggregator` with an iterator - over the output dataset. + To consume the statistics, associate a `StatsAggregator` with the output + dataset. Args: tag: String. 
All statistics recorded by the returned transformation will diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD index 2038a8fc742a0c..da66e86f29460c 100644 --- a/tensorflow/contrib/distribute/python/BUILD +++ b/tensorflow/contrib/distribute/python/BUILD @@ -21,11 +21,11 @@ py_library( srcs = ["values.py"], visibility = ["//tensorflow:internal"], deps = [ + ":input_ops", ":prefetching_ops_v2", "//tensorflow/contrib/data/python/ops:batching", "//tensorflow/contrib/eager/python:datasets", "//tensorflow/python:array_ops", - "//tensorflow/python:checkpointable", "//tensorflow/python:control_flow_ops", "//tensorflow/python:device_util", "//tensorflow/python:distribute", @@ -33,6 +33,7 @@ py_library( "//tensorflow/python:training", "//tensorflow/python:util", "//tensorflow/python/eager:context", + "//tensorflow/python/training/checkpointable:base", "@six_archive//:six", ], ) @@ -42,6 +43,7 @@ gpu_py_test( srcs = ["values_test.py"], additional_deps = [ ":mirrored_strategy", + ":multi_worker_test_base", ":values", "//tensorflow/core:protos_all_py", "//tensorflow/python/data/ops:dataset_ops", @@ -57,6 +59,9 @@ gpu_py_test( "//tensorflow/python/eager:test", "//tensorflow/python/estimator:model_fn", ], + tags = [ + "no_pip", + ], ) py_library( @@ -81,6 +86,19 @@ py_library( ], ) +py_library( + name = "multi_worker_strategy", + srcs = ["multi_worker_strategy.py"], + visibility = ["//tensorflow:internal"], + deps = [ + ":mirrored_strategy", + ":values", + "//tensorflow/core:protos_all_py", + "//tensorflow/python:training", + "//tensorflow/python:util", + ], +) + py_library( name = "one_device_strategy", srcs = ["one_device_strategy.py"], @@ -133,6 +151,7 @@ py_library( ":one_device_strategy", ":tpu_strategy", "//tensorflow/contrib/optimizer_v2:training", + "//tensorflow/python:distribute", "//tensorflow/python:framework_ops", "//tensorflow/python:training", "//tensorflow/python:util", @@ -216,6 +235,24 @@ gpu_py_test( ], ) +py_library( + name = "multi_worker_test_base", + testonly = 1, + srcs = ["multi_worker_test_base.py"], + srcs_version = "PY2AND3", + tags = [ + "no_pip", + ], + deps = [ + "//tensorflow/core:protos_all_py", + "//tensorflow/python:distributed_framework_test_lib", + "//tensorflow/python:platform", + "//tensorflow/python:session", + "//tensorflow/python:training", + "//tensorflow/python/eager:test", + ], +) + py_library( name = "step_fn", srcs = ["step_fn.py"], @@ -274,6 +311,7 @@ gpu_py_test( tags = [ "multi_and_single_gpu", "no_pip", + "noguitar", # TODO(b/109653107): test is flaky. 
], ) @@ -408,6 +446,7 @@ py_library( srcs = ["cross_tower_utils.py"], srcs_version = "PY2AND3", deps = [ + ":values", "//tensorflow/contrib/nccl:nccl_py", "//tensorflow/python:array_ops", "//tensorflow/python:framework_ops", @@ -415,6 +454,24 @@ py_library( ], ) +gpu_py_test( + name = "cross_tower_utils_test", + srcs = ["cross_tower_utils_test.py"], + additional_deps = [ + ":combinations", + ":cross_tower_utils", + "@absl_py//absl/testing:parameterized", + "//tensorflow/python:constant_op", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:test", + ], + tags = [ + "no_pip", + ], +) + py_library( name = "cross_tower_ops", srcs = ["cross_tower_ops.py"], @@ -433,24 +490,24 @@ py_library( ], ) -py_test( +gpu_py_test( name = "cross_tower_ops_test", srcs = ["cross_tower_ops_test.py"], - srcs_version = "PY2AND3", - tags = [ - "no_pip", - ], - deps = [ + additional_deps = [ ":combinations", ":cross_tower_ops", ":values", + "@absl_py//absl/testing:parameterized", "//tensorflow/python:array_ops", "//tensorflow/python:constant_op", "//tensorflow/python:framework_ops", "//tensorflow/python:math_ops", "//tensorflow/python/eager:context", "//tensorflow/python/eager:test", - "@absl_py//absl/testing:parameterized", + ], + tags = [ + "multi_and_single_gpu", + "no_pip", ], ) @@ -479,3 +536,52 @@ gpu_py_test( "//tensorflow/python/data/ops:iterator_ops", ], ) + +py_library( + name = "input_ops", + srcs = ["input_ops.py"], + visibility = ["//tensorflow:internal"], + deps = [ + "//tensorflow/python:framework_ops", + "//tensorflow/python/data/util:nest", + ], +) + +gpu_py_test( + name = "input_ops_test", + srcs = ["input_ops_test.py"], + additional_deps = [ + ":input_ops", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/contrib/data/python/ops:batching", + "//tensorflow/contrib/data/python/ops:interleave_ops", + "//tensorflow/python:errors", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_ops", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:io_ops", + "//tensorflow/python/data/ops:readers", + "//tensorflow/python:util", + ], + tags = [ + "no_pip", + ], +) + +gpu_py_test( + name = "keras_test", + srcs = ["keras_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow/contrib/distribute/python:mirrored_strategy", + "//tensorflow/python:client_testlib", + "//tensorflow/python:training", + "//tensorflow/python/estimator:keras", + "//tensorflow/python/estimator:run_config", + "//tensorflow/python/keras", + ], + tags = [ + "multi_and_single_gpu", + "notsan", + ], +) diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py index 946310aa6fc210..98e7228f24d8ca 100644 --- a/tensorflow/contrib/distribute/python/combinations.py +++ b/tensorflow/contrib/distribute/python/combinations.py @@ -41,16 +41,20 @@ def testOptimizer(self, optimizer): from collections import OrderedDict import sys +import types +import unittest from absl.testing import parameterized +import six -from tensorflow.contrib.distribute.python import mirrored_strategy -from tensorflow.contrib.distribute.python import one_device_strategy -from tensorflow.contrib.distribute.python import tpu_strategy +from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib +from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib +from tensorflow.contrib.distribute.python import 
tpu_strategy as tpu_lib from tensorflow.contrib.optimizer_v2 import adam as adam_v2 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2 from tensorflow.python.eager import context from tensorflow.python.framework import ops from tensorflow.python.training import adam +from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.training import gradient_descent from tensorflow.python.util import tf_inspect @@ -66,29 +70,35 @@ def generate(combinations): combinations: a list of dictionaries created using combine() and times(). Restrictions: - -- there should always be a "mode" argument. Accepted values are "eager" - and "graph". + -- the "mode" argument can be either "eager" or "graph". It's "graph" by + default. -- arguments of the test method must match by name to get the corresponding - value of the combination. Tests must accept all arguments (except "mode", - which is optional). - -- distribution argument is special. It is meant for passing instances of - DistributionStrategy. Each instance is to be passed as `(, - )` tuple, where is the number of required - GPUs. If the required number of GPUs for the DistributionStrategy isn't - available then the test case is going to be skipped. + value of the combination. Tests must accept all arguments except the + "mode", "required_tpu" and "required_gpus". + -- "distribution" argument is special and optional. It is meant for passing + instances of DistributionStrategy. Each instance is to be passed as via + `NamedDistribution`. If using "distribution", "required_gpus" and + "required_tpu" should be specified via the NamedDistribution instance, + rather than as separate arguments. + -- "required_tpu" argument is special and optional. If not `None`, then the + test will be skipped if TPUs aren't available. + -- "required_gpus" argument is special and optional. If not `None`, then the + test will be skipped if the specified number of GPUs aren't available. Returns: - a decorator that will cause the test method to be run under the specified - conditions. + a decorator that will cause the test method or the test class to be run + under the specified conditions. Raises: - ValueError - if "mode" argument wasn't either "eager" or "graph. + ValueError - if "mode" argument wasn't either "eager" or "graph" or if other + arguments were not accepted by the test method. """ - def decorator(test_function): + def decorator(test_method_or_class): """The decorator to be returned.""" # Generate good test names that can be used with --test_filter. + named_combinations = [] for combination in combinations: # We use OrderedDicts in `combine()` and `times()` to ensure stable # order of keys in each dictionary. 
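A sketch of a test written against the updated restrictions: the decorator supplies `x`, and `mode` defaults to "graph" so the test may omit it. Imports mirror those used by `combinations_test.py`; the class and method names are made up.

```python
from absl.testing import parameterized

from tensorflow.contrib.distribute.python import combinations
from tensorflow.python.eager import test


class SquareTest(test.TestCase, parameterized.TestCase):

  @combinations.generate(
      combinations.combine(mode=["graph", "eager"], x=[2, 3]))
  def testSquare(self, x):
    # "mode" is handled by the decorator; the test only accepts "x".
    self.assertEqual(x * x, x ** 2)


if __name__ == "__main__":
  test.main()
```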
@@ -99,59 +109,96 @@ def decorator(test_function): "".join(filter(str.isalnum, str(value)))) for key, value in combination.items() ]) - combination.update({"testcase_name": "_test{}".format(name)}) - - @parameterized.named_parameters(*combinations) - def decorated(self, **kwargs): - """A wrapped test method that sets up `test_function`.""" - assert "mode" in kwargs - mode = kwargs["mode"] - - if "distribution" in kwargs: - distribution = kwargs["distribution"] - kwargs["distribution"] = distribution.strategy - if distribution.required_tpu and not TPU_TEST: - self.skipTest("Test requires a TPU, but it's not available.") - if not distribution.required_tpu and TPU_TEST: - self.skipTest("Test that doesn't require a TPU.") - - if not distribution.required_gpus: - if GPU_TEST: - self.skipTest("Test that doesn't require GPUs.") - elif context.num_gpus() < distribution.required_gpus: - self.skipTest( - "{} GPUs are not available for this test. {} GPUs are available". - format(distribution.required_gpus, context.num_gpus())) - - requested_arguments = tf_inspect.getfullargspec(test_function).args - missing_arguments = set(list(kwargs.keys()) + ["self"]).difference( - set(requested_arguments + ["mode"])) - if missing_arguments: - raise ValueError("The test is missing arguments {} .".format( - missing_arguments)) - - kwargs_to_pass = {} - for arg in requested_arguments: - if arg == "self": - kwargs_to_pass[arg] = self - else: - kwargs_to_pass[arg] = kwargs[arg] - - if mode == "eager": - with context.eager_mode(), ops.Graph().as_default(): - test_function(**kwargs_to_pass) - elif mode == "graph": - with context.graph_mode(), ops.Graph().as_default(): - test_function(**kwargs_to_pass) - else: - raise ValueError( - "'mode' has to be either 'eager' or 'graph' and not {}".format( - mode)) + named_combinations.append( + OrderedDict( + list(combination.items()) + [("testcase_name", + "_test{}".format(name))])) + + if isinstance(test_method_or_class, type): + class_object = test_method_or_class + class_object._test_method_ids = test_method_ids = {} + for name, test_method in six.iteritems(class_object.__dict__.copy()): + if (name.startswith(unittest.TestLoader.testMethodPrefix) and + isinstance(test_method, types.FunctionType)): + delattr(class_object, name) + methods = {} + parameterized._update_class_dict_for_param_test_case( + class_object.__name__, methods, test_method_ids, name, + parameterized._ParameterizedTestIter( + _augment_with_special_arguments(test_method), + named_combinations, parameterized._NAMED, name)) + for method_name, method in six.iteritems(methods): + setattr(class_object, method_name, method) + + return class_object + else: + test_method = _augment_with_special_arguments(test_method_or_class) + return parameterized.named_parameters(*named_combinations)(test_method) - return decorated return decorator +def _augment_with_special_arguments(test_method): + def decorated(self, **kwargs): + """A wrapped test method that treats some arguments in a special way.""" + mode = kwargs.pop("mode", "graph") + + distribution = kwargs.pop("distribution", None) + required_tpu = kwargs.pop("required_tpu", False) + required_gpus = kwargs.pop("required_gpus", None) + + if distribution: + assert required_gpus is None, ( + "Do not use `required_gpus` and `distribution` together.") + assert required_tpu is False, ( + "Do not use `required_tpu` and `distribution` together.") + kwargs["distribution"] = distribution.strategy + required_gpus = distribution.required_gpus + required_tpu = distribution.required_tpu 
+ + if required_tpu and not TPU_TEST: + self.skipTest("Test requires a TPU, but it's not available.") + if not required_tpu and TPU_TEST: + self.skipTest("Test that doesn't require a TPU.") + + if not required_gpus: + if GPU_TEST: + self.skipTest("Test that doesn't require GPUs.") + elif context.num_gpus() < required_gpus: + self.skipTest( + "{} GPUs are not available for this test. {} GPUs are available". + format(required_gpus, context.num_gpus())) + + # At this point, `kwargs` doesn't have `required_gpus` or `required_tpu` + # that the user might have specified. `kwargs` still has `mode`, which + # the test is allowed to accept or ignore. + requested_arguments = tf_inspect.getfullargspec(test_method).args + missing_arguments = set(list(kwargs.keys()) + ["self"]).difference( + set(requested_arguments + ["mode"])) + if missing_arguments: + raise ValueError("The test is missing arguments {} .".format( + missing_arguments)) + + kwargs_to_pass = {} + for arg in requested_arguments: + if arg == "self": + kwargs_to_pass[arg] = self + else: + kwargs_to_pass[arg] = kwargs[arg] + + if mode == "eager": + with ops.Graph().as_default(), context.eager_mode(): + test_method(**kwargs_to_pass) + elif mode == "graph": + with ops.Graph().as_default(), context.graph_mode(): + test_method(**kwargs_to_pass) + else: + raise ValueError( + "'mode' has to be either 'eager' or 'graph' and not {}".format( + mode)) + return decorated + + def combine(**kwargs): """Generate combinations based on its keyword arguments. @@ -159,7 +206,8 @@ def combine(**kwargs): can be computed using `times()`. Args: - **kwargs: keyword arguments of form `option=[possibilities, ...]`. + **kwargs: keyword arguments of form `option=[possibilities, ...]` + or `option=the_only_possibility`. Returns: a list of dictionaries for each combination. 
Keys in the dictionaries are @@ -178,6 +226,8 @@ def combine(**kwargs): key = first[0] values = first[1] + if not isinstance(values, list): + values = [values] return [ OrderedDict(sorted(list(combined.items()) + [(key, v)], key=sort_by_key)) @@ -239,9 +289,9 @@ def __repr__(self): class NamedDistribution(object): """Translates DistributionStrategy and its data into a good name.""" - def __init__(self, name, distribution, required_gpus=None, + def __init__(self, name, distribution_fn, required_gpus=None, required_tpu=False): - self._distribution = distribution + self._distribution_fn = distribution_fn self._name = name self._required_gpus = required_gpus self._required_tpu = required_tpu @@ -251,7 +301,7 @@ def __repr__(self): @property def strategy(self): - return self._distribution + return self._distribution_fn() @property def required_gpus(self): @@ -262,21 +312,31 @@ def required_tpu(self): return self._required_tpu +# pylint: disable=g-long-lambda +default_strategy = NamedDistribution( + "Default", + lambda: distribute_lib._default_distribution_strategy, # pylint: disable=protected-access + required_gpus=None) one_device_strategy = NamedDistribution( - "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"), - None) -tpu_strategy = NamedDistribution( - "TPU", tpu_strategy.TPUStrategy(), required_tpu=True) + "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"), + required_gpus=None) +tpu_strategy_single_iteration = NamedDistribution( + "TPUSingleIteration", + lambda: tpu_lib.TPUStrategy(iterations_per_step=1), + required_tpu=True) +tpu_strategy = NamedDistribution("TPU", tpu_lib.TPUStrategy, required_tpu=True) +# Note that we disable prefetching for testing since prefetching makes +# the input non-deterministic. mirrored_strategy_with_gpu_and_cpu = NamedDistribution( "MirroredCPUAndGPU", - mirrored_strategy.MirroredStrategy(["/gpu:0", "/cpu:0"]), 1) -mirrored_strategy_without_prefetch = NamedDistribution( - "MirroredCPUAndGPUNoPrefetch", - mirrored_strategy.MirroredStrategy( - ["/gpu:0", "/cpu:0"], prefetch_on_device=False), 1) + lambda: mirrored_lib.MirroredStrategy( + ["/gpu:0", "/cpu:0"], prefetch_on_device=False), + required_gpus=1) mirrored_strategy_with_two_gpus = NamedDistribution( "Mirrored2GPUs", - mirrored_strategy.MirroredStrategy(["/gpu:0", "/gpu:1"]), 2) + lambda: mirrored_lib.MirroredStrategy( + ["/gpu:0", "/gpu:1"], prefetch_on_device=False), + required_gpus=2) adam_optimizer_v1_fn = NamedObject( "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1)) diff --git a/tensorflow/contrib/distribute/python/combinations_test.py b/tensorflow/contrib/distribute/python/combinations_test.py index 219b24160f3902..86aa48cea889c6 100644 --- a/tensorflow/contrib/distribute/python/combinations_test.py +++ b/tensorflow/contrib/distribute/python/combinations_test.py @@ -19,6 +19,7 @@ from __future__ import print_function from collections import OrderedDict +from absl.testing import parameterized from tensorflow.contrib.distribute.python import combinations from tensorflow.python.eager import test @@ -41,6 +42,15 @@ def test_combine(self): "b": 3 }], combinations.combine(a=[1, 2], b=[2, 3])) + def test_combine_single_parameter(self): + self.assertEqual([{ + "a": 1, + "b": 2 + }, { + "a": 2, + "b": 2 + }], combinations.combine(a=[1, 2], b=2)) + def test_add(self): self.assertEqual( [{ @@ -111,5 +121,28 @@ def test_overlapping_keys(self): _ = combinations.times(c1, c2) +@combinations.generate(combinations.combine(a=[1, 0], b=[2, 3], c=[1])) +class 
CombineTheTestSuite(parameterized.TestCase): + + def test_add_things(self, a, b, c): + self.assertLessEqual(3, a + b + c) + self.assertLessEqual(a + b + c, 5) + + def test_add_things_one_more(self, a, b, c): + self.assertLessEqual(3, a + b + c) + self.assertLessEqual(a + b + c, 5) + + def not_a_test(self, a=0, b=0, c=0): + del a, b, c + self.fail() + + def _test_but_private(self, a=0, b=0, c=0): + del a, b, c + self.fail() + + # Check that nothing funny happens to a non-callable that starts with "_test". + test_member = 0 + + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py index cff717db80f0bd..a411b880e80291 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops.py +++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py @@ -53,15 +53,14 @@ def _validate_value_destination_pairs(value_destination_pairs): return True +# TODO(yuefengz): consider calling this function in the caller of CrossTowerOps. def _get_devices_from(destinations): if isinstance(destinations, value_lib.DistributedValues): return list(destinations.devices) elif isinstance(destinations, six.string_types): - return [device_util.canonicalize(destinations)] + return [device_util.resolve(destinations)] else: - return [ - device_util.canonicalize(destination) for destination in destinations - ] + return [device_util.resolve(destination) for destination in destinations] def _devices_match(left, right): @@ -78,12 +77,12 @@ def _all_devices_match(value_destination_pairs): return True -def _simple_broadcast(tensor, destinations): +def _simple_broadcast(value, destinations): index = {} devices = _get_devices_from(destinations) for d in devices: - with ops.device(d): - index[d] = array_ops.identity(tensor) + index[d] = cross_tower_utils.copy_tensor_or_indexed_slices_to_device( + value, d) return value_lib.Mirrored(index) @@ -99,7 +98,9 @@ def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn, continue count += len(v_list) # Sum within each device before aggregating across devices. - v = math_ops.add_n(v_list) + # TODO(yuefengz): Check whether it helps to use accumulation_fn here. 
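A schematic of the `_simple_reduce` flow being modified here, with plain Python numbers standing in for per-device tensors; the real code additionally places each step on a device and routes `IndexedSlices` through the new `cross_tower_utils` helpers:

```python
def simple_reduce_sketch(per_device_values, method_string):
    """per_device_values: dict mapping a device string to its list of values."""
    all_values, count = [], 0
    for values_on_device in per_device_values.values():
        # Sum within each device before aggregating across devices.
        all_values.append(sum(values_on_device))
        count += len(values_on_device)
    reduced = sum(all_values)  # aggregation on the reduce-to device
    if method_string == "mean":
        reduced /= count
    elif method_string != "sum":
        raise ValueError("`method_string` must be 'sum' or 'mean'")
    return reduced


# Two "devices", three values in total: sum -> 12, mean -> 4.0.
print(simple_reduce_sketch({"/cpu:0": [1, 2], "/gpu:0": [9]}, "mean"))
```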
+ v = cross_tower_utils.aggregate_tensors_or_indexed_slices( + v_list, math_ops.add_n) else: count += 1 all_values.append(v) @@ -108,11 +109,12 @@ def _simple_reduce(per_device_value, reduce_to_device, accumulation_fn, with ops.device(reduce_to_device): with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT): - if method_string == "sum": - reduced = accumulation_fn(all_values) - elif method_string == "mean": - reduced = accumulation_fn(all_values) / count - else: + reduced = cross_tower_utils.aggregate_tensors_or_indexed_slices( + all_values, accumulation_fn) + if method_string == "mean": + reduced = cross_tower_utils.divide_by_n_tensors_or_indexed_slices( + reduced, count) + elif method_string != "sum": raise ValueError("`method_string` must be 'sum' or 'mean'") return reduced @@ -445,10 +447,18 @@ def __init__(self, super(AllReduceCrossTowerOps, self).__init__() def _reduce(self, method_string, per_device_value, destinations): + contains_indexed_slices = cross_tower_utils.contains_indexed_slices( + per_device_value) if ((destinations is None or _devices_match(per_device_value, destinations)) - and not context.executing_eagerly()): + and not context.executing_eagerly() + and not contains_indexed_slices): return self._batch_all_reduce(method_string, [per_device_value])[0] else: + if contains_indexed_slices: + logging.log_first_n( + logging.WARN, + "Efficient allreduce is not supported for IndexedSlices.", 10) + devices = _get_devices_from(destinations or per_device_value) reduce_to_device = devices[0] reduced = _simple_reduce(per_device_value, reduce_to_device, @@ -456,14 +466,18 @@ def _reduce(self, method_string, per_device_value, destinations): return self.broadcast(reduced, devices) def _batch_reduce(self, method_string, value_destination_pairs): - if (_all_devices_match(value_destination_pairs) and - not context.executing_eagerly()): + all_devices_match = _all_devices_match(value_destination_pairs) + contains_indexed_slices = cross_tower_utils.contains_indexed_slices( + value_destination_pairs) + if (all_devices_match and not context.executing_eagerly() + and not contains_indexed_slices): return self._batch_all_reduce(method_string, [v[0] for v in value_destination_pairs]) else: - if not context.executing_eagerly(): + if not all_devices_match: logging.warning("Efficient batch_reduce is not supported if " "destinations are different.") + return [ self._reduce(method_string, t, destinations=v) for t, v in value_destination_pairs diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py index 7c7b0870887465..2a266326088def 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py +++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.training import device_util def _make_per_device(values, devices): @@ -56,19 +57,46 @@ def _fake_mirrored(value, devices): {d: v for d, v in zip(devices, [value] * len(devices))}) +def _make_indexed_slices(values, indices, dense_shape, device): + with ops.device(device): + tensor = ops.IndexedSlices( + values=constant_op.constant(values), + indices=constant_op.constant(indices), + dense_shape=constant_op.constant(dense_shape)) + return tensor + + +def _make_mirrored_indexed_slices(devices, values, indices, dense_shape): + return value_lib.Mirrored({ + d: 
_make_indexed_slices(values, indices, dense_shape, d) for d in devices + }) + + _cpu_device = "/device:CPU:0" class CrossTowerOpsTest(test.TestCase, parameterized.TestCase): - def _assert_value_equal(self, left, right): + def _assert_indexed_slices_equal(self, left, right): + self.assertIsInstance(left, ops.IndexedSlices) + self.assertIsInstance(right, ops.IndexedSlices) + self.assertEqual(device_util.resolve(left.device), + device_util.resolve(right.device)) + self.assertAllEqual( + self.evaluate(ops.convert_to_tensor(left)), + self.evaluate(ops.convert_to_tensor(right))) + + def _assert_values_equal(self, left, right): if isinstance(left, list): for l, r in zip(left, right): - self._assert_value_equal(l, r) + self._assert_values_equal(l, r) else: self.assertEqual(type(left), type(right)) self.assertEqual(left.devices, right.devices) - if context.executing_eagerly(): + if isinstance(list(left._index.values())[0], ops.IndexedSlices): + for (d, v) in left._index.iteritems(): + self._assert_indexed_slices_equal(v, right._index[d]) + elif context.executing_eagerly(): self.assertEqual([v.numpy() for v in left._index.values()], list(right._index.values())) else: @@ -143,29 +171,29 @@ def testReductionAndBroadcast(self, cross_tower_ops, distribution): # test reduce() for destinations in all_destinations: - self._assert_value_equal( + self._assert_values_equal( cross_tower_ops.reduce("mean", per_device, destinations=destinations), _fake_mirrored(mean, destinations or per_device)) - self._assert_value_equal( + self._assert_values_equal( cross_tower_ops.reduce( "mean", per_device_2, destinations=destinations), _fake_mirrored(mean_2, destinations or per_device)) - self._assert_value_equal( + self._assert_values_equal( cross_tower_ops.reduce("sum", per_device, destinations=destinations), _fake_mirrored(mean * len(devices), destinations or per_device)) - self._assert_value_equal( + self._assert_values_equal( cross_tower_ops.reduce( "sum", per_device_2, destinations=destinations), _fake_mirrored(mean_2 * len(devices), destinations or per_device)) # test batch_reduce() for d1, d2 in itertools.product(all_destinations, all_destinations): - self._assert_value_equal( + self._assert_values_equal( cross_tower_ops.batch_reduce( "mean", [(per_device, d1), (per_device_2, d2)]), [_fake_mirrored(mean, d1 or per_device), _fake_mirrored(mean_2, d2 or per_device_2)]) - self._assert_value_equal( + self._assert_values_equal( cross_tower_ops.batch_reduce( "sum", [(per_device, d1), (per_device_2, d2)]), [_fake_mirrored(mean * len(devices), d1 or per_device), @@ -176,7 +204,7 @@ def testReductionAndBroadcast(self, cross_tower_ops, distribution): if destinations is None: continue else: - self._assert_value_equal( + self._assert_values_equal( cross_tower_ops.broadcast(constant_op.constant(1.), destinations), _fake_mirrored(1., destinations)) @@ -184,16 +212,14 @@ def testChooseAlgorithm(self): device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]] result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertTrue( - isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)) + self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps) self.assertEqual(result.all_reduce_alg, "hierarchical_copy") self.assertEqual(result.num_packs, 8) # if there are only 4 devices device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]] result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - 
self.assertTrue( - isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)) + self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps) self.assertEqual(result.all_reduce_alg, "nccl") self.assertEqual(result.num_packs, 1) @@ -202,8 +228,7 @@ def testChooseAlgorithm(self): [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7], [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]] result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertTrue( - isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)) + self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps) self.assertEqual(result.all_reduce_alg, "hierarchical_copy") self.assertEqual(result.num_packs, 8) @@ -211,11 +236,85 @@ def testChooseAlgorithm(self): device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]] result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links) - self.assertTrue( - isinstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)) + self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps) self.assertEqual(result.all_reduce_alg, "nccl") self.assertEqual(result.num_packs, 1) + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + required_gpus=1)) + def testSimpleReduceWithIndexedSlices(self): + devices = ["/cpu:0", "/gpu:0"] + t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0]) + t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1]) + per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1}) + result = cross_tower_ops_lib._simple_reduce(per_device, devices[0], + math_ops.add_n, "sum") + + # Test that the result is semantically equal to both the concatenated + # IndexedSlices with and without duplicate indices. 
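The assertions around this test rely on an `IndexedSlices` with duplicate indices and its merged form describing the same dense tensor. A small NumPy check of that equivalence, using the same values and indices as the expected results below (illustrative only):

```python
import numpy as np


def densify(values, indices, dense_shape):
    dense = np.zeros(dense_shape)
    for value, index in zip(values, indices):
        dense[index] += value  # duplicate indices accumulate into the same row
    return dense


with_dups = densify([[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2])
without_dups = densify([[4., 6.], [5., 6.]], [1, 3], [5, 2])
assert np.allclose(with_dups, without_dups)
```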
+ total_with_dups = _make_indexed_slices( + [[1., 2.], [3., 4.], [5., 6.]], [1, 1, 3], [5, 2], devices[0]) + total_without_dups = _make_indexed_slices( + [[4., 6.], [5., 6.]], [1, 3], [5, 2], devices[0]) + self._assert_indexed_slices_equal(total_with_dups, result) + self._assert_indexed_slices_equal(total_without_dups, result) + + @combinations.generate(combinations.combine( + cross_tower_ops_instance=[ + combinations.NamedObject( + "ReductionToOneDeviceCrossTowerOps", + cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()), + combinations.NamedObject( + "AllReduceCrossTowerOps", + cross_tower_ops_lib.AllReduceCrossTowerOps()) + ], + method_string=["sum", "mean"], + batch_reduce=[True, False], + mode=["graph", "eager"], + required_gpus=1)) + def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, + method_string, batch_reduce): + devices = ["/cpu:0", "/gpu:0"] + dense_shape = [5, 2] + t0 = _make_indexed_slices([[1., 2.]], [1], dense_shape, devices[0]) + t1 = _make_indexed_slices( + [[3., 4.], [5., 6.]], [1, 3], dense_shape, devices[1]) + per_device = value_lib.PerDevice({devices[0]: t0, devices[1]: t1}) + + if batch_reduce: + result = cross_tower_ops_instance.batch_reduce(method_string, + [(per_device, devices)]) + else: + result = cross_tower_ops_instance.reduce(method_string, per_device, + devices) + + total_indices_with_dups = [1, 1, 3] + total_indices_without_dups = [1, 3] + + if method_string == "sum": + total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]] + total_values_without_dups = [[4., 6.], [5., 6.]] + else: + assert method_string == "mean" + total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]] + total_values_without_dups = [[2., 3.], [2.5, 3.]] + + total_mirrored_with_dups = _make_mirrored_indexed_slices( + devices, total_values_with_dups, total_indices_with_dups, dense_shape) + total_mirrored_without_dups = _make_mirrored_indexed_slices( + devices, total_values_without_dups, total_indices_without_dups, + dense_shape) + + # Test that the result is semantically equal to both the concatenated + # IndexedSlices, as well as when the duplicate indices are summed up. 
+ if batch_reduce: + total_mirrored_with_dups = [total_mirrored_with_dups] + total_mirrored_without_dups = [total_mirrored_without_dups] + + self._assert_values_equal(total_mirrored_with_dups, result) + self._assert_values_equal(total_mirrored_without_dups, result) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py index fc04e2195f6d30..137fabf4c739bb 100644 --- a/tensorflow/contrib/distribute/python/cross_tower_utils.py +++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py @@ -21,9 +21,11 @@ import collections as pycoll from tensorflow.contrib import nccl +from tensorflow.contrib.distribute.python import values as value_lib from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops @@ -337,3 +339,46 @@ def unpack_small_tensors(tower_grads, packing): new_gv_list.insert(idx, gv[gi]) new_tower_grads.append(new_gv_list) return new_tower_grads + + +def aggregate_tensors_or_indexed_slices(values, accumulation_fn=math_ops.add_n): + """Aggregate tensors using `accumulation_fn` and IndexedSlices via concat.""" + if any(isinstance(v, ops.IndexedSlices) for v in values): + return gradients_impl._AggregateIndexedSlicesGradients(values) # pylint: disable=protected-access + else: + return accumulation_fn(values) + + +def divide_by_n_tensors_or_indexed_slices(value, n): + if isinstance(value, ops.IndexedSlices): + value = gradients_impl._HandleNestedIndexedSlices(value) # pylint: disable=protected-access + return ops.IndexedSlices( + value.values / n, value.indices, value.dense_shape) + else: + return value / n + + +def copy_tensor_or_indexed_slices_to_device(value, device): + with ops.device(device): + if isinstance(value, ops.IndexedSlices): + copied_values = array_ops.identity(value.values) + copied_indices = array_ops.identity(value.indices) + copied_shape = array_ops.identity(value.dense_shape) + result = ops.IndexedSlices(copied_values, copied_indices, copied_shape) + else: + result = array_ops.identity(value) + return result + + +def contains_indexed_slices(value): + """Check whether the value is `IndexedSlices` or contains `IndexedSlices`.""" + if isinstance(value, ops.IndexedSlices): + return True + elif isinstance(value, (list, tuple)) and value: + return any(contains_indexed_slices(v) for v in value) + elif isinstance(value, value_lib.DistributedValues): + return contains_indexed_slices(list(value._index.values())) # pylint: disable=protected-access + elif isinstance(value, value_lib.MapOutput): + return contains_indexed_slices(value.get()) + else: + return False diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py b/tensorflow/contrib/distribute/python/cross_tower_utils_test.py new file mode 100644 index 00000000000000..4ef8db681503dc --- /dev/null +++ b/tensorflow/contrib/distribute/python/cross_tower_utils_test.py @@ -0,0 +1,152 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for cross_tower_utils.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from absl.testing import parameterized + +from tensorflow.contrib.distribute.python import combinations +from tensorflow.contrib.distribute.python import cross_tower_utils +from tensorflow.contrib.distribute.python import values as value_lib +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.framework import test_util +from tensorflow.python.ops import math_ops +from tensorflow.python.training import device_util + + +class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase): + + def _assert_values_equal(self, left, right): + self.assertAllEqual( + self.evaluate(ops.convert_to_tensor(left)), + self.evaluate(ops.convert_to_tensor(right))) + + @test_util.run_in_graph_and_eager_modes() + def testAggregateTensors(self): + t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) + t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]]) + total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]]) + result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1]) + self._assert_values_equal(total, result) + + @test_util.run_in_graph_and_eager_modes() + def testAggregateIndexedSlices(self): + t0 = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + t1 = math_ops._as_indexed_slices( + constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) + total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]]) + result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1]) + self.assertIsInstance(result, ops.IndexedSlices) + self._assert_values_equal(total, result) + + @test_util.run_in_graph_and_eager_modes() + def testDivideTensor(self): + t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) + n = 2 + expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]]) + result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n) + self._assert_values_equal(expected, result) + + @test_util.run_in_graph_and_eager_modes() + def testDivideIndexedSlices(self): + t = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + n = 2 + expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]]) + result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n) + self.assertIsInstance(result, ops.IndexedSlices) + self._assert_values_equal(expected, result) + + @test_util.run_in_graph_and_eager_modes() + def testIsIndexedSlices(self): + t = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + self.assertTrue(cross_tower_utils.contains_indexed_slices(t)) + + @test_util.run_in_graph_and_eager_modes() + def testContainsIndexedSlices_List(self): + t0 = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + t1 = math_ops._as_indexed_slices( + constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) + 
self.assertTrue(cross_tower_utils.contains_indexed_slices([t0, t1])) + + @test_util.run_in_graph_and_eager_modes() + def testContainsIndexedSlices_Tuple(self): + t0 = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + t1 = math_ops._as_indexed_slices( + constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) + self.assertTrue(cross_tower_utils.contains_indexed_slices((t0, t1))) + + @test_util.run_in_graph_and_eager_modes() + def testContainsIndexedSlices_PerDevice(self): + t0 = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + t1 = math_ops._as_indexed_slices( + constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) + per_device = value_lib.PerDevice({"/gpu:0": t0, "/cpu:0": t1}) + self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device)) + + @test_util.run_in_graph_and_eager_modes() + def testContainsIndexedSlices_PerDeviceMapOutput(self): + t0 = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + t1 = math_ops._as_indexed_slices( + constant_op.constant([[0., 0.], [5, 6], [7., 8.]])) + per_device = value_lib.PerDevice({ + "/gpu:0": value_lib.MapOutput([t0]), + "/cpu:0": value_lib.MapOutput([t1])}) + self.assertTrue(cross_tower_utils.contains_indexed_slices(per_device)) + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + required_gpus=1)) + def testCopyTensor(self): + with ops.device("/cpu:0"): + t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]]) + destination = "/gpu:0" + result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device( + t, destination) + + self._assert_values_equal(t, result) + self.assertEqual(device_util.resolve(destination), + device_util.resolve(result.device)) + + @combinations.generate(combinations.combine( + mode=["graph", "eager"], + required_gpus=1)) + def testCopyIndexedSlices(self): + with ops.device("/cpu:0"): + t = math_ops._as_indexed_slices( + constant_op.constant([[1., 2.], [0, 0], [3., 4.]])) + destination = "/gpu:0" + result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device( + t, destination) + + self.assertIsInstance(result, ops.IndexedSlices) + self._assert_values_equal(t, result) + self.assertEqual(device_util.resolve(destination), + device_util.resolve(result.device)) + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py index b87224251ca384..2b05884b9b9347 100644 --- a/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py +++ b/tensorflow/contrib/distribute/python/examples/simple_tfkeras_example.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -"""An example tf.keras model that is trained using MirroredStrategy.""" +"""An example of training tf.keras Model using MirroredStrategy.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -from sys import argv + +import sys + import numpy as np import tensorflow as tf @@ -33,30 +35,37 @@ def input_fn(): def main(args): if len(args) < 2: - print('You must specify model_dir for checkpoints such as' - ' /tmp/tfkeras_example./') + print('You must specify model_dir for checkpoints such as' + ' /tmp/tfkeras_example/.') return - print('Using %s to store checkpoints.' 
% args[1]) - - strategy = tf.contrib.distribute.MirroredStrategy( - ['/device:GPU:0', '/device:GPU:1']) - config = tf.estimator.RunConfig(train_distribute=strategy) - optimizer = tf.train.GradientDescentOptimizer(0.2) + model_dir = args[1] + print('Using %s to store checkpoints.' % model_dir) + # Define tf.keras Model. model = tf.keras.Sequential() model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,))) model.add(tf.keras.layers.Dense(1, activation='sigmoid')) + # Compile tf.keras Model. + optimizer = tf.train.GradientDescentOptimizer(0.2) model.compile(loss='binary_crossentropy', optimizer=optimizer) model.summary() tf.keras.backend.set_learning_phase(True) + + # Define a DistributionStrategy and convert the tf.keras Model to a + # tf.Estimator that utilizes the DistributionStrategy. + strategy = tf.contrib.distribute.MirroredStrategy( + ['/device:GPU:0', '/device:GPU:1']) + config = tf.estimator.RunConfig(train_distribute=strategy) keras_estimator = tf.keras.estimator.model_to_estimator( - keras_model=model, config=config, model_dir=args[1]) + keras_model=model, config=config, model_dir=model_dir) + # Train and evaluate the tf.Estimator. keras_estimator.train(input_fn=input_fn, steps=10) eval_result = keras_estimator.evaluate(input_fn=input_fn) print('Eval result: {}'.format(eval_result)) + if __name__ == '__main__': - tf.app.run(argv=argv) + tf.app.run(argv=sys.argv) diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/contrib/distribute/python/input_ops.py new file mode 100644 index 00000000000000..1f24f629479b6a --- /dev/null +++ b/tensorflow/contrib/distribute/python/input_ops.py @@ -0,0 +1,141 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Input-pipeline utilities for Distribution strategies.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.data.ops import readers +from tensorflow.python.data.util import nest +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.platform import tf_logging + +# TODO(priyag): Any other reader datasets to consider here? +_READER_DATASET_OPS = [ + "TextLineDataset", + "TFRecordDataset", + "FixedLengthRecordDataset" +] + + +# pylint: disable=protected-access +def auto_shard_dataset(dataset, num_shards, index): + """Shard the input pipeline by sharding the underlying list of files. + + Args: + dataset: A `tf.data.Dataset` instance, typically the result of a bunch of + dataset transformations. + num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of + shards operating in parallel. Same usage as in `Dataset.shard`. + index: A `tf.int64` scalar `tf.Tensor`, representing the worker index. + Same usage as in `Dataset.shard`. 
+ + Returns: + A modified `Dataset` obtained by updating the pipeline sharded by the + files. + + Raises: + NotImplementedError: If we cannot automatically determine a good way to + shard the input dataset. + """ + + # TODO(priyag): Clone datasets instead of updating in place, similar to the + # clone method for TFRecordDataset. + def _auto_shard_impl(dataset, found_reader_op): + """Recursive implementation of auto sharding.""" + + if not found_reader_op: + # TODO(priyag): Make this check more robust by enforcing some common + # property on reader datasets. + if (isinstance(dataset, readers.TextLineDataset) or + isinstance(dataset, readers.FixedLengthRecordDataset)): + filenames_tensor = dataset._filenames + num_files = array_ops.size(filenames_tensor) + sharded_filenames_tensor = array_ops.gather( + filenames_tensor, math_ops.range(index, num_files, num_shards)) + dataset._filenames = sharded_filenames_tensor + return dataset + elif isinstance(dataset, readers.TFRecordDataset): + # `TFRecordDataset` needs to be handled separately than other readers + # because it converts filenames to a dataset first. Also, we clone it + # instead of updating in place because it has special logic in the + # constructor. Eventually we will change all cases to clone datasets + # instead of updating in-place. + return dataset._clone( + filenames=dataset._filenames.shard(num_shards, index)) + elif hasattr(dataset, "_map_func"): + # TODO(priyag): Make this check more robust by enforcing some common + # property on all map/flatmap/interleave datasets. + map_func_def = dataset._map_func.definition + for node in map_func_def.node_def: + if node.op in _READER_DATASET_OPS: + found_reader_op = True + break + elif node.op == "FlatMapDataset": + # TODO(priyag): Should this check for other map datasets? Should it + # be recursive? It is too specific to implementation of + # TFRecordDataset right now. + nested_func_name = node.attr["f"].func.name + nested_func = ops.get_default_graph()._functions[nested_func_name] + for nested_node in nested_func.definition.node_def: + if nested_node.op in _READER_DATASET_OPS: + found_reader_op = True + break + if found_reader_op: + break + if found_reader_op: + dataset._input_dataset = _auto_shard_impl( + dataset._input_dataset, found_reader_op) + return dataset + + # TODO(priyag): Make _input_dataset(s) a common property of all datasets to + # make this check more robust. + if hasattr(dataset, "_input_dataset"): + dataset._input_dataset = _auto_shard_impl( + dataset._input_dataset, found_reader_op) + if hasattr(dataset, "_dataset_to_concatenate"): + # Special case for `ConcatentateDataset`. We want to shard all input + # datasets. + dataset._dataset_to_concatenate = _auto_shard_impl( + dataset._dataset_to_concatenate, found_reader_op) + return dataset + + if hasattr(dataset, "_datasets"): + # Special case for `ZipDataset`. + dataset._datasets = nest.pack_sequence_as(dataset._datasets, [ + _auto_shard_impl(ds, found_reader_op) + for ds in nest.flatten(dataset._datasets) + ]) + return dataset + + if not found_reader_op: + tf_logging.warn( + "Could not find a standard reader in the input pipeline" + "(one of TextLineDataset, TFRecordDataset, FixedLengthRecordDataset)." + "Falling back to sharding the dataset anyway. Please verify" + "correctness of auto-sharding for your input.") + + # TODO(priyag): What do we want to do if the number of filenames is + # uneven in the number of shards? By default, this will just return as + # many items it can before throwing OutOfRangeError. 
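A rough sketch of the idea behind `auto_shard_dataset`, with hypothetical file names: shard the file list before the reader so each worker only opens its own files, falling back to element-level `Dataset.shard` when no known reader is found (the warning case above):

```python
import tensorflow as tf

num_shards, index = 2, 0
filenames = ["train-%05d.tfrecord" % i for i in range(10)]  # hypothetical names

# What auto_shard_dataset tries to recover from an existing pipeline: shard the
# *file list* so each worker only opens its own files, then apply the reader.
dataset = (tf.data.Dataset.from_tensor_slices(filenames)
           .shard(num_shards, index)
           .flat_map(tf.data.TFRecordDataset))

# The element-level fallback, `dataset.shard(num_shards, index)`, instead keeps
# every element at position i with i % num_shards == index; for the file list
# above that selection rule picks the even-numbered files 0, 2, 4, 6, 8.
print([f for i, f in enumerate(filenames) if i % num_shards == index])
```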
+ # TODO(priyag): This will shard the filenames before any shuffling of the + # filename dataset. It might be desirable to shard after shuffling + # filenames? If so, how do we achieve that? + return dataset.shard(num_shards, index) + + return _auto_shard_impl(dataset=dataset, found_reader_op=False) diff --git a/tensorflow/contrib/distribute/python/input_ops_test.py b/tensorflow/contrib/distribute/python/input_ops_test.py new file mode 100644 index 00000000000000..16179c3a4903c8 --- /dev/null +++ b/tensorflow/contrib/distribute/python/input_ops_test.py @@ -0,0 +1,265 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for input pipeline modifications for distribution strategies.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.contrib.data.python.ops import batching +from tensorflow.contrib.data.python.ops import interleave_ops +from tensorflow.contrib.distribute.python import input_ops +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.data.ops import readers +from tensorflow.python.framework import errors +from tensorflow.python.lib.io import python_io +from tensorflow.python.platform import test +from tensorflow.python.util import compat + + +class AutoShardDatasetTest(test.TestCase): + + def setUp(self): + super(AutoShardDatasetTest, self).setUp() + self._num_files = 10 + self._num_records = 4 + self._num_shards = 2 + self._shard_index = 0 + self._record_bytes = 10 + + def _record(self, r, f): + return compat.as_bytes("Record %d of file %d" % (r, f)) + + def _text_line(self, r, f): + return compat.as_bytes("Text line %d of file %d" % (r, f)) + + def _fixed_length_record(self, r, f): + return compat.as_bytes(str((r * f) % 10) * self._record_bytes) + + def _createTFRecordFiles(self): + filenames = [] + for i in range(self._num_files): + fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i) + filenames.append(fn) + writer = python_io.TFRecordWriter(fn) + for j in range(self._num_records): + record = self._record(j, i) + writer.write(record) + writer.close() + return filenames + + def _createTextFiles(self): + filenames = [] + for i in range(self._num_files): + fn = os.path.join(self.get_temp_dir(), "text_line.%d.txt" % i) + filenames.append(fn) + contents = [] + for j in range(self._num_records): + contents.append(self._text_line(j, i)) + if j + 1 != self._num_records or i == 0: + contents.append(b"\r\n") + contents = b"".join(contents) + + with open(fn, "wb") as f: + f.write(contents) + return filenames + + def _createFixedLengthRecordFiles(self): + filenames = [] + for i in range(self._num_files): + fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i) + filenames.append(fn) + with open(fn, "wb") as f: + for j in range(self._num_records): + f.write(self._fixed_length_record(j, i)) + return 
filenames + + def _verifySimpleShardingOutput(self, dataset, record_fn): + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + with self.test_session() as sess: + for f in range(self._shard_index, self._num_files, self._num_shards): + for r in range(self._num_records): + self.assertAllEqual(record_fn(r, f), sess.run(next_element)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + def testTFRecordDataset(self): + dataset = readers.TFRecordDataset(self._createTFRecordFiles()) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + self._verifySimpleShardingOutput(dataset, self._record) + + def testFlatMap(self): + dataset = dataset_ops.Dataset.from_tensor_slices( + self._createTFRecordFiles()) + dataset = dataset.flat_map(readers.TFRecordDataset) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + self._verifySimpleShardingOutput(dataset, self._record) + + def testInterleave(self): + dataset = dataset_ops.Dataset.from_tensor_slices( + self._createTFRecordFiles()) + dataset = dataset.interleave( + readers.TFRecordDataset, cycle_length=4, block_length=self._num_records) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + # Since block_length == num records in each file, the output will still + # contain records in order of files. + self._verifySimpleShardingOutput(dataset, self._record) + + def testParallelInterleave(self): + dataset = dataset_ops.Dataset.from_tensor_slices( + self._createTFRecordFiles()) + dataset = dataset.apply(interleave_ops.parallel_interleave( + readers.TFRecordDataset, + cycle_length=4, + block_length=self._num_records)) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + # Since block_length == num records in each file, the output will still + # contain records in order of files. + self._verifySimpleShardingOutput(dataset, self._record) + + def testListfiles(self): + filenames = self._createTFRecordFiles() + file_pattern = filenames[0].rsplit("/", 1)[0] + "/tf_record.*.txt" + dataset = dataset_ops.Dataset.list_files(file_pattern, shuffle=False) + dataset = dataset.flat_map(readers.TFRecordDataset) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + with self.test_session() as sess: + actual, expected = [], [] + for f in range(self._shard_index, self._num_files, self._num_shards): + for r in range(self._num_records): + actual.append(sess.run(next_element)) + expected.append(self._record(r, f)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + self.assertAllEqual(expected, actual) + + def testComplexPipeline(self): + # Setup a complex input pipeline. + batch_size = 2 + num_epochs = 5 + dataset = dataset_ops.Dataset.from_tensor_slices( + self._createTFRecordFiles()) + dataset = dataset.shuffle(buffer_size=self._num_files) + dataset = dataset.flat_map(readers.TFRecordDataset) + dataset = dataset.prefetch(buffer_size=batch_size) + dataset = dataset.shuffle(2 * self._num_files * self._num_records) + dataset = dataset.repeat(num_epochs) + dataset = dataset.apply(batching.map_and_batch( + lambda x: x, batch_size=batch_size)) + dataset = dataset.prefetch(buffer_size=None) + + # Auto shard. + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + # Verify output. 
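The verification that follows computes how many whole batches one shard should yield; with the sizes from `setUp` (10 files, 4 records each, 2 shards) and this test's batch size of 2 over 5 epochs, the arithmetic works out as:

```python
num_files, num_records, num_epochs = 10, 4, 5
num_shards, batch_size = 2, 2

records_per_shard = (num_files // num_shards) * num_records * num_epochs  # 100
num_iterations = (num_files * num_records * num_epochs) // (
    num_shards * batch_size)
assert num_iterations == records_per_shard // batch_size == 50
```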
+ iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + with self.test_session() as sess: + actual = [] + num_iterations = (self._num_files * self._num_records * num_epochs) // ( + self._num_shards * batch_size) + for _ in range(num_iterations): + actual.extend(sess.run(next_element)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + expected = [] + for f in range(0, self._num_files, self._num_shards): + for r in range(self._num_records): + expected.append(self._record(r, f)) + expected *= num_epochs + + self.assertAllEqual(sorted(expected), sorted(actual)) + + def testZip(self): + dataset1 = readers.TFRecordDataset(self._createTFRecordFiles()) + dataset2 = readers.TextLineDataset(self._createTextFiles()) + dataset = dataset_ops.Dataset.zip((dataset1, dataset2)) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + record_fn = lambda r, f: (self._record(r, f), self._text_line(r, f)) + self._verifySimpleShardingOutput(dataset, record_fn) + + def testConcat(self): + dataset1 = readers.TFRecordDataset(self._createTFRecordFiles()) + dataset2 = readers.TextLineDataset(self._createTextFiles()) + dataset = dataset1.concatenate(dataset2) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + iterator = dataset.make_one_shot_iterator() + next_element = iterator.get_next() + with self.test_session() as sess: + for f in range(self._shard_index, self._num_files, self._num_shards): + for r in range(self._num_records): + self.assertAllEqual(self._record(r, f), sess.run(next_element)) + for f in range(self._shard_index, self._num_files, self._num_shards): + for r in range(self._num_records): + self.assertAllEqual(self._text_line(r, f), sess.run(next_element)) + with self.assertRaises(errors.OutOfRangeError): + sess.run(next_element) + + def testTextLineReader(self): + dataset = readers.TextLineDataset(self._createTextFiles()) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + self._verifySimpleShardingOutput(dataset, self._text_line) + + def testTextLineReaderWithFlatMap(self): + dataset = dataset_ops.Dataset.from_tensor_slices(self._createTextFiles()) + dataset = dataset.flat_map(readers.TextLineDataset) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + self._verifySimpleShardingOutput(dataset, self._text_line) + + def testFixedLengthReader(self): + dataset = readers.FixedLengthRecordDataset( + self._createFixedLengthRecordFiles(), self._record_bytes) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + self._verifySimpleShardingOutput(dataset, self._fixed_length_record) + + def testFixedLengthReaderWithFlatMap(self): + dataset = dataset_ops.Dataset.from_tensor_slices( + self._createFixedLengthRecordFiles()) + dataset = dataset.flat_map( + lambda f: readers.FixedLengthRecordDataset(f, self._record_bytes)) + dataset = input_ops.auto_shard_dataset( + dataset, self._num_shards, self._shard_index) + + self._verifySimpleShardingOutput(dataset, self._fixed_length_record) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py new file mode 100644 index 00000000000000..75ecd90dcffa7a --- /dev/null +++ b/tensorflow/contrib/distribute/python/keras_test.py @@ -0,0 +1,148 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. 
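The new `keras_test.py` below and the reworked `simple_tfkeras_example.py` above exercise the same flow: compile a `tf.keras` model, put a `MirroredStrategy` into `RunConfig(train_distribute=...)`, and train through `model_to_estimator`. A condensed sketch of that flow using the 1.x-era APIs shown in this diff (devices and hyperparameters are illustrative):

```python
import tensorflow as tf


def train_with_mirrored_strategy(input_fn, model_dir):
    model = tf.keras.Sequential([
        tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)),
        tf.keras.layers.Dense(1, activation='sigmoid')])
    model.compile(loss='binary_crossentropy',
                  optimizer=tf.train.GradientDescentOptimizer(0.2))

    # The DistributionStrategy reaches the Estimator through RunConfig.
    strategy = tf.contrib.distribute.MirroredStrategy(
        ['/device:GPU:0', '/device:GPU:1'])
    config = tf.estimator.RunConfig(train_distribute=strategy)

    estimator = tf.keras.estimator.model_to_estimator(
        keras_model=model, config=config, model_dir=model_dir)
    estimator.train(input_fn=input_fn, steps=10)
    return estimator.evaluate(input_fn=input_fn)
```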
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for Keras Sequential and Functional models.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +import numpy as np + +from tensorflow.contrib.distribute.python import mirrored_strategy +from tensorflow.python import keras +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.estimator import keras as keras_lib +from tensorflow.python.estimator import run_config as run_config_lib +from tensorflow.python.framework import test_util +from tensorflow.python.keras import testing_utils +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test +from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import rmsprop + +_RANDOM_SEED = 1337 +_TRAIN_SIZE = 200 +_INPUT_SIZE = (10,) +_NUM_CLASS = 2 + + +def simple_sequential_model(): + model = keras.models.Sequential() + model.add(keras.layers.Dense(16, activation='relu', input_shape=_INPUT_SIZE)) + model.add(keras.layers.Dropout(0.1)) + model.add(keras.layers.Dense(_NUM_CLASS, activation='softmax')) + return model + + +def simple_functional_model(): + a = keras.layers.Input(shape=_INPUT_SIZE) + b = keras.layers.Dense(16, activation='relu')(a) + b = keras.layers.Dropout(0.1)(b) + b = keras.layers.Dense(_NUM_CLASS, activation='softmax')(b) + model = keras.models.Model(inputs=[a], outputs=[b]) + return model + + +def get_ds_train_input_fn(): + np.random.seed(_RANDOM_SEED) + (x_train, y_train), _ = testing_utils.get_test_data( + train_samples=_TRAIN_SIZE, + test_samples=50, + input_shape=_INPUT_SIZE, + num_classes=_NUM_CLASS) + y_train = keras.utils.to_categorical(y_train) + + dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train)) + dataset = dataset.batch(32) + return dataset + + +def get_ds_test_input_fn(): + np.random.seed(_RANDOM_SEED) + _, (x_test, y_test) = testing_utils.get_test_data( + train_samples=_TRAIN_SIZE, + test_samples=50, + input_shape=_INPUT_SIZE, + num_classes=_NUM_CLASS) + y_test = keras.utils.to_categorical(y_test) + + dataset = dataset_ops.Dataset.from_tensor_slices((x_test, y_test)) + dataset = dataset.batch(32) + return dataset + + +class TestKerasDistributionStrategy(test_util.TensorFlowTestCase): + + def setUp(self): + self._base_dir = os.path.join(self.get_temp_dir(), + 'keras_mirrored_strategy_test') + gfile.MakeDirs(self._base_dir) + self._config = run_config_lib.RunConfig( + tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir) + + def tearDown(self): + writer_cache.FileWriterCache.clear() + if os.path.isdir(self._base_dir): + gfile.DeleteRecursively(self._base_dir) + + def test_train_functional_with_distribution_strategy(self): + dist = mirrored_strategy.MirroredStrategy( + devices=['/device:GPU:0', '/device:GPU:1']) + keras_model = simple_functional_model() + keras_model.compile( + 
loss='categorical_crossentropy', + optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) + config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, + model_dir=self._base_dir, + train_distribute=dist) + with self.test_session(): + est_keras = keras_lib.model_to_estimator( + keras_model=keras_model, config=config) + before_eval_results = est_keras.evaluate( + input_fn=get_ds_test_input_fn, steps=1) + est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16) + after_eval_results = est_keras.evaluate(input_fn=get_ds_test_input_fn, + steps=1) + self.assertLess(after_eval_results['loss'], before_eval_results['loss']) + + writer_cache.FileWriterCache.clear() + gfile.DeleteRecursively(self._config.model_dir) + + def test_train_sequential_with_distribution_strategy(self): + dist = mirrored_strategy.MirroredStrategy( + devices=['/device:GPU:0', '/device:GPU:1']) + keras_model = simple_sequential_model() + keras_model.compile( + loss='categorical_crossentropy', + optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01)) + config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED, + model_dir=self._base_dir, + train_distribute=dist) + with self.test_session(): + est_keras = keras_lib.model_to_estimator( + keras_model=keras_model, config=config) + before_eval_results = est_keras.evaluate( + input_fn=get_ds_test_input_fn, steps=1) + est_keras.train(input_fn=get_ds_train_input_fn, steps=_TRAIN_SIZE / 16) + after_eval_results = est_keras.evaluate(input_fn=get_ds_test_input_fn, + steps=1) + self.assertLess(after_eval_results['loss'], before_eval_results['loss']) + + writer_cache.FileWriterCache.clear() + gfile.DeleteRecursively(self._config.model_dir) + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py index e134fe34e10be4..5c056a7c73def2 100644 --- a/tensorflow/contrib/distribute/python/minimize_loss_test.py +++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py @@ -44,13 +44,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase): combinations.distributions_and_v1_optimizers(), combinations.combine(mode=["graph"], use_callable_loss=[True, False]) + combinations.combine(mode=["eager"], use_callable_loss=[True]), - combinations.combine(is_tpu=[False])) + - combinations.combine( - distribution=[combinations.tpu_strategy], - optimizer_fn=[combinations.adam_optimizer_v1_fn], - mode=["graph"], - use_callable_loss=[False], - is_tpu=[True])) + combinations.combine(is_tpu=[False])) + combinations.combine( + distribution=[combinations.tpu_strategy], + optimizer_fn=[ + combinations.adam_optimizer_v1_fn, + # TODO(isaprykin): Make Adam v2 work with while_loops + # and TPUs. 
+ ], + mode=["graph"], + use_callable_loss=[False], + is_tpu=[True])) def testTrainNetwork(self, distribution, optimizer_fn, use_callable_loss, is_tpu): with distribution.scope(): @@ -101,7 +104,8 @@ def run_step(): distribution=[combinations.tpu_strategy], optimizer_fn=[ combinations.adam_optimizer_v1_fn, - combinations.gradient_descent_optimizer_v1_fn + combinations.gradient_descent_optimizer_v1_fn, + combinations.gradient_descent_optimizer_v2_fn, ], mode=["graph"], is_tpu=[True])) @@ -171,13 +175,28 @@ def get_expected_variables(optimizer_fn, num_parameter_devices): set(created_variables)) @combinations.generate( - combinations.times(combinations.distributions_and_v1_optimizers(), - combinations.combine( - mode=["graph", "eager"], - momentum=[0.8, 0.9, 0.99], - renorm=[False, True]))) + combinations.times( + combinations.combine(momentum=[0.8, 0.9, 0.99], renorm=[False, True]), + combinations.times( + combinations.distributions_and_v1_optimizers(), + combinations.combine( + mode=["graph", "eager"], + is_tpu=[False], + # TODO(isaprykin): Allow False here. Currently subsequent + # towers will re-execute UPDATE_OPS of previous towers. + update_ops_in_cross_tower_mode=[True])) + + combinations.combine( + distribution=[combinations.tpu_strategy_single_iteration], + optimizer_fn=[ + combinations.gradient_descent_optimizer_v1_fn, + combinations.gradient_descent_optimizer_v2_fn + ], + mode=["graph"], + is_tpu=[True], + update_ops_in_cross_tower_mode=[False]))) def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum, - renorm): + renorm, is_tpu, + update_ops_in_cross_tower_mode): """Verifies that moving mean updates are reduced across towers.""" with distribution.scope(): num_towers = len(distribution.worker_devices) @@ -185,27 +204,30 @@ def testTrainNetworkWithBatchNorm(self, distribution, optimizer_fn, momentum, optimizer_fn, batch_per_epoch=num_towers, momentum=momentum, - renorm=renorm) + renorm=renorm, + update_ops_in_tower_mode=not update_ops_in_cross_tower_mode) - # Disable prefetching since that makes the specific input on each device - # to be non deterministic, and this test relies on specific input being - # on each device. + # Make sure prefetching is disabled since that makes the + # specific input on each device to be non deterministic, and + # this test relies on specific input being on each device. 
if isinstance(distribution, mirrored_strategy.MirroredStrategy): - distribution._prefetch_on_device = False + self.assertFalse(distribution._prefetch_on_device) iterator = distribution.distribute_dataset( dataset_fn).make_one_shot_iterator() def run_step(): - return control_flow_ops.group( - distribution.unwrap( - distribution.call_for_each_tower( - model_fn, - iterator.get_next(), - run_concurrently=batchnorm.built)) + - ops.get_collection(ops.GraphKeys.UPDATE_OPS)) + fetches = distribution.unwrap( + distribution.call_for_each_tower( + model_fn, iterator.get_next(), + run_concurrently=batchnorm.built)) + if update_ops_in_cross_tower_mode: + fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS) + return control_flow_ops.group(fetches) if not context.executing_eagerly(): with self.test_session() as sess: + if is_tpu: + sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) self.evaluate(variables_lib.global_variables_initializer()) @@ -229,22 +251,40 @@ def averaged_batch_mean(i): expected_moving_mean - averaged_batch_mean(i)) * (1.0 - momentum)) self.assertNear(expected_moving_means[i], moving_means[i], 0.0001) + if is_tpu: + with self.test_session() as sess: + sess.run(tpu.shutdown_system()) + @combinations.generate( combinations.times( combinations.combine( - distribution=[combinations.one_device_strategy, - combinations.mirrored_strategy_with_gpu_and_cpu, - combinations.mirrored_strategy_with_two_gpus], - optimizer_fn=[combinations.gradient_descent_optimizer_v1_fn, - combinations.gradient_descent_optimizer_v2_fn], - loss_reduction=[losses_impl.Reduction.SUM, - losses_impl.Reduction.MEAN, - losses_impl.Reduction.SUM_OVER_BATCH_SIZE, - losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS]), - combinations.combine(mode=["graph"], use_callable_loss=[True, False]) - + combinations.combine(mode=["eager"], use_callable_loss=[True]))) + optimizer_fn=[ + combinations.gradient_descent_optimizer_v1_fn, + combinations.gradient_descent_optimizer_v2_fn + ], + loss_reduction=[ + losses_impl.Reduction.SUM, losses_impl.Reduction.MEAN, + losses_impl.Reduction.SUM_OVER_BATCH_SIZE, + losses_impl.Reduction.SUM_OVER_NONZERO_WEIGHTS + ]), + combinations.times( + combinations.combine( + distribution=[ + combinations.one_device_strategy, + combinations.mirrored_strategy_with_gpu_and_cpu, + combinations.mirrored_strategy_with_two_gpus + ], + is_tpu=[False]), + combinations.combine( + mode=["graph"], use_callable_loss=[True, False]) + + combinations.combine(mode=["eager"], use_callable_loss=[True])) + + combinations.combine( + distribution=[combinations.tpu_strategy_single_iteration], + is_tpu=[True], + mode=["graph"], + use_callable_loss=[True, False]))) def testMeanVsSum(self, distribution, optimizer_fn, loss_reduction, - use_callable_loss): + use_callable_loss, is_tpu): with distribution.scope(): all_vars = [] @@ -280,12 +320,13 @@ def run_step(): if not context.executing_eagerly(): with self.test_session() as sess: + if is_tpu: + sess.run(tpu.initialize_system()) run_step = sess.make_callable(run_step()) self.evaluate(variables_lib.global_variables_initializer()) run_step() - self.assertEqual(distribution.num_towers, len(all_vars)) v = all_vars[0] self.assertTrue(all([v is vi for vi in all_vars[1:]])) weight = numpy.squeeze(self.evaluate(distribution.fetch(v))) @@ -312,6 +353,10 @@ def run_step(): # One of the mean loss reductions. 
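The expected values above follow the standard moving-average update for batch norm statistics, `moving_mean -= (moving_mean - batch_mean) * (1 - momentum)`, applied once per step with the batch mean averaged across towers. A toy version of that recurrence (the batch means here are made up):

```python
def update_moving_mean(moving_mean, batch_mean, momentum):
    # Same recurrence the test uses to build its expected values.
    return moving_mean - (moving_mean - batch_mean) * (1.0 - momentum)


m = 0.0
for batch_mean in [1.0, 2.0, 3.0]:  # per-step batch means (illustrative)
    m = update_moving_mean(m, batch_mean, momentum=0.9)
print(m)  # drifts slowly toward the recent batch means
```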
self.assertNear(weight, 2 + 10.6, 0.0001) + if is_tpu: + with self.test_session() as sess: + sess.run(tpu.shutdown_system()) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py index 6efd578a775da7..cef0a2907b85d2 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py @@ -18,6 +18,7 @@ from __future__ import division from __future__ import print_function +import contextlib import threading import six @@ -39,6 +40,16 @@ # TODO(josh11b): Replace asserts in this file with if ...: raise ... +@contextlib.contextmanager +def _enter_graph(g): + if context.executing_eagerly(): + with g.as_default(), context.eager_mode(): + yield + else: + with g.as_default(): + yield + + def _cpu_device(device): cpu_device = tf_device.DeviceSpec.from_string(device) cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0)) @@ -73,13 +84,13 @@ def __init__(self, assert len(set(devices)) == len(devices), ( "No duplicates allowed in `devices` argument.") # TODO(josh11b): Require at least 2 devices? - self._devices = devices - self._canonical_device_set = set( - [device_util.canonicalize(d) for d in devices]) + self._devices = [device_util.resolve(d) for d in devices] + self._canonical_device_set = set(self._devices) self._device_index = values.PerDevice( dict((d, i) for i, d in enumerate(devices))) self._cross_tower_ops = cross_tower_ops self._prefetch_on_device = prefetch_on_device + # TODO(yuefengz): consider setting the default device. def _create_variable(self, next_creator, *args, **kwargs): """Create a mirrored variable. See `DistributionStrategy.scope`.""" @@ -107,13 +118,19 @@ def _create_variable(self, next_creator, *args, **kwargs): if i > 0: # Give replicas meaningful distinct names: var0name = index[devices[0]].name.split(":")[0] - kwargs["name"] = "%s/replica_%d" % (var0name, i) + # We append a / to variable names created on towers with id > 0 to + # ensure that we ignore the name scope and instead use the given + # name as the absolute name of the variable. + kwargs["name"] = "%s/replica_%d/" % (var0name, i) # Initialize replicas with the same value: if context.executing_eagerly(): - initial_value = index[devices[0]].value() + kwargs["initial_value"] = array_ops.identity( + index[devices[0]].value()) else: - initial_value = index[devices[0]].initial_value - kwargs["initial_value"] = array_ops.identity(initial_value) + def initial_value_fn(device=d): + with ops.device(device): + return array_ops.identity(index[devices[0]].initial_value) + kwargs["initial_value"] = initial_value_fn with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT): v = next_creator(*args, **kwargs) assert not isinstance(v, values.DistributedVariable) @@ -244,8 +261,15 @@ def _call_for_each_tower(self, fn, *args, **kwargs): {t.device: t.merge_args for t in threads}) merge_kwargs = values.regroup( {t.device: t.merge_kwargs for t in threads}) - merge_result = threads[0].merge_fn( - self, *merge_args, **merge_kwargs) + # We capture the name_scope of the MTT when we call merge_fn + # to ensure that if we have opened a name scope in the MTT, + # it will be respected when executing the merge function. We only + # capture the name_scope from the first MTT and assume it is + # the same for all other MTTs. 
+ mtt_captured_name_scope = threads[0].captured_name_scope + with ops.name_scope(mtt_captured_name_scope): + merge_result = threads[0].merge_fn( + self, *merge_args, **merge_kwargs) for t in threads: t.merge_result = values.select_device(t.device, merge_result) finally: @@ -321,7 +345,6 @@ def _update_non_slot(self, colocate_with, fn, *args, **kwargs): def _fetch(self, val, destination, fn): """Return a copy of `val` or `fn(val)` on `destination`.""" - assert isinstance(destination, six.string_types) if isinstance(val, values.TowerLocalVariable): val = self.reduce(val.reduce_method, val, destinations=destination) with ops.device(destination): @@ -386,7 +409,9 @@ def _get_devices_from(self, colocate_with=None): # pylint: disable=protected-access return list(colocate_with._index.keys()) elif isinstance(colocate_with, six.string_types): - return [colocate_with] + return [device_util.resolve(colocate_with)] + elif isinstance(colocate_with, list): + return [device_util.resolve(d) for d in colocate_with] else: return colocate_with @@ -413,6 +438,7 @@ def __init__(self, dist, coord, device, variable_creator_fn, fn, *args, self.merge_args = None self.merge_kwargs = None self.merge_result = None + self.captured_name_scope = None # We use a thread.Event for the main thread to signal when this # thread should start running (`should_run`), and another for # this thread to transfer control back to the main thread @@ -436,13 +462,13 @@ def __init__(self, dist, coord, device, variable_creator_fn, fn, *args, self._variable_creator_stack = self.graph._variable_creator_stack[:] self._captured_var_scope = variable_scope.get_variable_scope() # Adding a "/" at end lets us re-enter this scope later. - self._captured_name_scope = self.graph.get_name_scope() - if self._captured_name_scope: - self._captured_name_scope += "/" + self._name_scope = self.graph.get_name_scope() + if self._name_scope: + self._name_scope += "/" if self.tower_id > 0: - if not self._captured_name_scope: - self._captured_name_scope = "" - self._captured_name_scope += "tower_%d/" % self.tower_id + if not self._name_scope: + self._name_scope = "" + self._name_scope += "tower_%d/" % self.tower_id def run(self): # pylint: disable=protected-access @@ -455,10 +481,10 @@ def run(self): with self.coord.stop_on_exception(), \ context.context()._mode(self.context_mode), \ context.context().device_policy(self.context_device_policy), \ - self.graph.as_default(), \ + _enter_graph(self.graph), \ MirroredTowerContext(self.distribution, self.tower_id), \ ops.device(self.device), \ - ops.name_scope(self._captured_name_scope), \ + ops.name_scope(self._name_scope), \ variable_scope.variable_scope( self._captured_var_scope, reuse=self.tower_id > 0), \ variable_scope.variable_creator_scope(self.variable_creator_fn): @@ -484,6 +510,10 @@ def _merge_call(self, fn, *args, **kwargs): t.merge_fn = fn t.merge_args = args t.merge_kwargs = kwargs + t.captured_name_scope = t.graph.get_name_scope() + # Adding a "/" at end lets us re-enter this scope later. 
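The comment above relies on a 1.x graph-mode detail: passing `ops.name_scope` a string that ends with "/" re-enters that exact scope instead of creating a uniquified one. A small sketch of that behavior, assuming a 1.x-style `tensorflow` import:

```python
import tensorflow as tf

g = tf.Graph()
with g.as_default():
    with tf.name_scope("tower_1"):
        captured = g.get_name_scope() + "/"  # "tower_1/"
    # Re-entering via the captured string lands back in the same scope,
    # rather than in a new "tower_1_1" scope.
    with tf.name_scope(captured):
        c = tf.constant(1.0, name="c")
    assert c.op.name == "tower_1/c"
```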
+ if t.captured_name_scope: + t.captured_name_scope += "/" t.has_paused.set() t.should_run.wait() t.should_run.clear() diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py index 6c5c055070c0fc..bccd278847e3c8 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py @@ -28,9 +28,12 @@ from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.layers import core +from tensorflow.python.ops import rnn +from tensorflow.python.ops import rnn_cell_impl from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables from tensorflow.python.training import distribute as distribute_lib @@ -116,7 +119,6 @@ def run_fn(device_id): self.assertEqual(expected, self.evaluate(unwrapped[0])) -@test_util.with_c_api class MirroredStrategyVariableCreationTest(test.TestCase): config = config_pb2.ConfigProto() @@ -370,22 +372,27 @@ def model_fn(device_id): expected_sum = 0.0 expected_mean = 0.0 for i, d in enumerate(dist.worker_devices): - # Test access within a device scope, should see different values. - with ops.device(d): - v_sum_value = self.evaluate(ret_v_sum.read_value()) - v_mean_value = self.evaluate(ret_v_mean.read_value()) - expected = i + 3.0 - self.assertEqual(expected, v_sum_value) - expected_sum += expected - expected = i * 6.0 - self.assertEqual(expected, v_mean_value) - expected_mean += expected - - # fetch() should return the value you get by applying the - # reduction across all towers. - self.assertEqual(expected_sum, self.evaluate(dist.fetch(ret_v_sum))) + # Should see different values on different devices. + v_sum_value = self.evaluate(ret_v_sum.get(d).read_value()) + v_mean_value = self.evaluate(ret_v_mean.get(d).read_value()) + expected = i + 3.0 + self.assertEqual(expected, v_sum_value) + expected_sum += expected + expected = i * 6.0 + self.assertEqual(expected, v_mean_value) + expected_mean += expected expected_mean /= len(dist.worker_devices) + + # Without get(device), should return the value you get by + # applying the reduction across all towers (whether you use + # fetch(), get(), or nothing). + self.assertEqual(expected_sum, self.evaluate(dist.fetch(ret_v_sum))) self.assertEqual(expected_mean, self.evaluate(dist.fetch(ret_v_mean))) + self.assertEqual(expected_sum, self.evaluate(ret_v_sum.get())) + self.assertEqual(expected_mean, self.evaluate(ret_v_mean.get())) + if not context.executing_eagerly(): + self.assertEqual(expected_sum, self.evaluate(ret_v_sum)) + self.assertEqual(expected_mean, self.evaluate(ret_v_mean)) # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not # testing this in eager mode. @@ -431,6 +438,98 @@ def model_fn(): self.assertEquals("foo/" + name + ":0", v0.name) self.assertEquals("tower_1/foo/" + name + ":0", v1.name) + # variable_scope.variable() respects name scopes when creating + # variables. On the other hand variable_scope.get_variable() ignores name + # scopes when creating variables. We test both methods of creating variables + # to make sure that we have the same variable names in both cases. 
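+  # For example (illustrative only): under `with ops.name_scope("foo"):`,
+  # variable_scope.variable(1.0, name="v") is named "foo/v:0", whereas
+  # variable_scope.get_variable("v", []) is named "v:0".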
+ def testNameScopeWithVariable(self): + def in_cross_tower(_): + c = variable_scope.variable(1.0, name="c") + return c + + def model_fn(): + b = variable_scope.variable(1.0, name="b") + with ops.name_scope("foo"): + c = distribute_lib.get_tower_context().merge_call(in_cross_tower) + return b, c + + dist = mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:CPU:0"]) + + with context.graph_mode(), dist.scope(): + with ops.name_scope("main"): + a = variable_scope.variable(1.0, name="a") + result = dist.call_for_each_tower(model_fn, run_concurrently=False) + result_b = result[0] + result_c = result[1] + self.assertIsInstance(result_b, values.DistributedValues) + self.assertIsInstance(result_c, values.DistributedValues) + a0, a1 = dist.unwrap(a) + b0, b1 = dist.unwrap(result_b) + c0, c1 = dist.unwrap(result_c) + self.assertEquals("main/a:0", a0.name) + self.assertEquals("main/a/replica_1:0", a1.name) + self.assertEquals("main/b:0", b0.name) + self.assertEquals("main/b/replica_1:0", b1.name) + self.assertEquals("main/foo/c:0", c0.name) + self.assertEquals("main/foo/c/replica_1:0", c1.name) + + def testNameScopeWithGetVariable(self): + def in_cross_tower(_): + c = variable_scope.get_variable("c", [1]) + return c + + def model_fn(): + b = variable_scope.get_variable("b", [1]) + with ops.name_scope("foo"): + c = distribute_lib.get_tower_context().merge_call(in_cross_tower) + return b, c + + dist = mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:CPU:0"]) + + with context.graph_mode(), dist.scope(): + with ops.name_scope("main"): + a = variable_scope.get_variable("a", [1]) + result = dist.call_for_each_tower(model_fn, run_concurrently=False) + result_b = result[0] + result_c = result[1] + self.assertIsInstance(result_b, values.DistributedValues) + self.assertIsInstance(result_c, values.DistributedValues) + a0, a1 = dist.unwrap(a) + b0, b1 = dist.unwrap(result_b) + c0, c1 = dist.unwrap(result_c) + self.assertEquals("a:0", a0.name) + self.assertEquals("a/replica_1:0", a1.name) + self.assertEquals("b:0", b0.name) + self.assertEquals("b/replica_1:0", b1.name) + self.assertEquals("c:0", c0.name) + self.assertEquals("c/replica_1:0", c1.name) + + def testDynamicRnnVariables(self): + def model_fn(): + inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]]) + cell_fw = rnn_cell_impl.LSTMCell(300) + cell_bw = rnn_cell_impl.LSTMCell(300) + (outputs, _) = rnn.bidirectional_dynamic_rnn( + cell_fw, + cell_bw, + inputs, + dtype=dtypes.float32) + return outputs + + dist = mirrored_strategy.MirroredStrategy( + ["/device:GPU:0", "/device:CPU:0"]) + + with context.graph_mode(), dist.scope(): + result = dist.call_for_each_tower(model_fn, run_concurrently=False) + # Two variables are created by the RNN layer. 
+ self.assertEquals(2, len(result)) + for v in result: + self.assertIsInstance(v, values.DistributedValues) + _, v1 = dist.unwrap(v) + self.assertStartsWith(v1.name, "tower_1/") + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py index a1ef0ecc77a8e8..61cbe6df813bb2 100644 --- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py +++ b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py @@ -27,7 +27,6 @@ from tensorflow.python.training import distribute as distribute_lib -@test_util.with_c_api class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase): def _get_distribution_strategy(self): @@ -53,7 +52,6 @@ def testCallAndMergeExceptions(self): self._test_call_and_merge_exceptions(self._get_distribution_strategy()) -@test_util.with_c_api class VariableCreatorStackTest(test.TestCase): def testCreatorStacksAreThreadLocal(self): diff --git a/tensorflow/contrib/distribute/python/monitor_test.py b/tensorflow/contrib/distribute/python/monitor_test.py index 8277e1e7919e86..4fdb9bf69b4f6a 100644 --- a/tensorflow/contrib/distribute/python/monitor_test.py +++ b/tensorflow/contrib/distribute/python/monitor_test.py @@ -25,6 +25,7 @@ from tensorflow.contrib.distribute.python import monitor as monitor_lib from tensorflow.contrib.distribute.python import one_device_strategy from tensorflow.contrib.distribute.python.single_loss_example import single_loss_example +from tensorflow.python.client import session from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import ops @@ -65,7 +66,7 @@ def testPassingASessionInEager(self): step_function, _ = single_loss_example( lambda: gradient_descent.GradientDescentOptimizer(0.2), distribution) - with self.test_session() as sess: + with session.Session() as sess, context.eager_mode(): with self.assertRaisesRegexp(ValueError, "Should not provide"): _ = monitor_lib.Monitor(step_function, sess) diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy.py b/tensorflow/contrib/distribute/python/multi_worker_strategy.py new file mode 100644 index 00000000000000..a552b370ebf359 --- /dev/null +++ b/tensorflow/contrib/distribute/python/multi_worker_strategy.py @@ -0,0 +1,141 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Classes implementing a mirrored DistributionStrategy for multiple workers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from functools import partial
+
+from tensorflow.contrib.distribute.python import values
+from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
+from tensorflow.core.protobuf import cluster_pb2
+from tensorflow.python.training import device_util
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import nest
+
+
+# TODO(yuefengz): support between-graph replication.
+# TODO(yuefengz): merge this class into its base class.
+# TODO(yuefengz): in some cases, we probably want to use configure method to
+# configure this class.
+# TODO(yuefengz): MirroredStrategy.worker_devices may be confusing after the
+# class is introduced.
+class MultiWorkerMirroredStrategy(MirroredStrategy):
+  """Mirrored strategy that works on multiple workers with in-graph replication.
+
+  There are several important concepts for distributed TensorFlow, e.g.
+  `client`, `job`, `task`, `cluster`, `in-graph replication` and
+  `synchronous training`, and they have already been defined in
+  [TensorFlow's documentation](https://www.tensorflow.org/deploy/distributed).
+  The distribution strategy inherits these concepts as well, and in addition
+  we clarify several more concepts:
+    * **In-graph replication**: the `client` creates a single `tf.Graph` that
+    specifies tasks for devices on all workers. The `client` then creates a
+    client session which will talk to the `master` service of a `worker`. Then
+    the `master` will partition the graph and distribute the work to all
+    participating workers.
+    * **Worker**: A `worker` is a TensorFlow `task` that usually maps to one
+    physical machine. There will be multiple `worker`s, each with a different
+    `task` index. They all do similar things, except that one worker also
+    checkpoints model variables, writes summaries, etc. in addition to its
+    ordinary work.
+
+  This class maps one tower to one device on a worker. It mirrors all model
+  variables on all towers. For example, if you have two `worker`s and each
+  `worker` has 4 GPUs, it will create 8 copies of the model variables on these 8
+  GPUs. Then like in MirroredStrategy, each tower performs its computation with
+  its own copy of the variables unless in cross-tower mode, where variable or
+  tensor reduction happens.
+  """
+
+  def __init__(self,
+               num_gpus_per_worker=1,
+               worker_job_name=None,
+               num_workers=None,
+               cluster=None,
+               cross_tower_ops=None,
+               prefetch_on_device=None):
+    """Initialize the strategy object.
+
+    Args:
+      num_gpus_per_worker: number of GPUs per worker. If it is zero, the local
+        CPU will be used.
+      worker_job_name: the job name for `worker`, typically just 'worker'.
+      num_workers: the number of workers. If it is 0, it degenerates to a
+        single-worker MirroredStrategy.
+      cluster: a `tf.train.ClusterSpec` object or a dict that can be used to
+        construct a `tf.train.ClusterSpec` object or a `tf.train.ClusterDef`
+        protocol buffer. It is an alternative way to initialize this object.
+      cross_tower_ops: the cross-tower ops to use. If None, a default one will
+        be used. If the configure method is called, the best one for the
+        configuration will be chosen.
+      prefetch_on_device: a boolean to specify whether to prefetch input to
+        each worker's devices.
+
+    Raises:
+      ValueError: if an unexpected `cluster` is given.
+    """
+    if cluster is None:
+      self._workers = [
+          '/job:%s/task:%d' % (worker_job_name, task_index)
+          for task_index in range(num_workers)
+      ]
+    else:
+      if isinstance(cluster, (dict, cluster_pb2.ClusterDef)):
+        cluster_spec = server_lib.ClusterSpec(cluster)
+      elif isinstance(cluster, server_lib.ClusterSpec):
+        cluster_spec = cluster
+      else:
+        raise ValueError(
+            '`cluster_spec` should be a dict or a `tf.train.ClusterSpec` or a '
+            '`tf.train.ClusterDef` object')
+
+      self._workers = []
+      for job in sorted(cluster_spec.jobs):
+        for task in range(cluster_spec.num_tasks(job)):
+          self._workers.append('/job:%s/task:%d' % (job, task))
+
+    self._num_gpus_per_worker = num_gpus_per_worker
+    if num_gpus_per_worker > 0:
+      self._worker_device_map = {
+          worker: [
+              device_util.canonicalize(worker + '/device:GPU:%d' % gpu)
+              for gpu in range(num_gpus_per_worker)
+          ] for worker in self._workers
+      }
+    else:
+      self._worker_device_map = {
+          worker: [device_util.canonicalize(worker, '/device:CPU:0')]
+          for worker in self._workers
+      }
+    self._devices = nest.flatten(self._worker_device_map.values())
+
+    super(MultiWorkerMirroredStrategy, self).__init__(
+        devices=self._devices, prefetch_on_device=prefetch_on_device)
+
+    # Setting `_default_device` will add a device scope in the
+    # distribution.scope. We set the default device to the first worker. When
+    # users specify a device under distribution.scope by
+    #   with tf.device("/cpu:0"):
+    #     ...
+    # their ops will end up on the CPU device of the first worker, e.g.
+    # "/job:worker/task:0/device:CPU:0". Note this is not used in tower mode.
+    self._default_device = self._workers[0]
+
+  def distribute_dataset(self, dataset_fn):
+    return values.MultiWorkerDataset(
+        partial(self._call_dataset_fn, dataset_fn), self._worker_device_map,
+        self._prefetch_on_device)
diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py b/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
new file mode 100644
index 00000000000000..09c859b32a3150
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/multi_worker_strategy_test.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== +"""Tests for MultiWorkerMirroredStrategy.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.distribute.python import multi_worker_strategy +from tensorflow.contrib.distribute.python import multi_worker_test_base +from tensorflow.contrib.distribute.python import strategy_test_lib +from tensorflow.python.eager import context +from tensorflow.python.eager import test +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops +from tensorflow.python.training import server_lib + + +class MultiWorkerStrategyTest(multi_worker_test_base.MultiWorkerTestBase, + strategy_test_lib.DistributionTestBase): + + def _get_distribution_strategy(self): + return multi_worker_strategy.MultiWorkerMirroredStrategy( + cluster=server_lib.ClusterSpec({ + 'worker': ['/job:worker/task:0', '/job:worker/task:1'] + }), + num_gpus_per_worker=context.num_gpus()) + + def testMinimizeLossGraph(self): + self._test_minimize_loss_graph(self._get_distribution_strategy()) + + +class DeviceScopeTest(test.TestCase): + """Test the device scope of MultiWorkerMirroredStrategy.""" + + def testDeviceScope(self): + with context.graph_mode(): + strategy = multi_worker_strategy.MultiWorkerMirroredStrategy( + cluster={'worker': ['/job:worker/task:0', '/job:worker/task:1']}, + num_gpus_per_worker=context.num_gpus()) + with strategy.scope(): + a = constant_op.constant(1.) + with ops.device('/cpu:0'): + b = constant_op.constant(1.) + self.assertEqual(a.device, '/job:worker/task:0') + self.assertEqual(b.device, '/job:worker/task:0/device:CPU:0') + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py new file mode 100644 index 00000000000000..f659be5f42594b --- /dev/null +++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py @@ -0,0 +1,90 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Base testing class for strategies that require multiple nodes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import contextlib +import copy + +from tensorflow.core.protobuf import config_pb2 +from tensorflow.core.protobuf import rewriter_config_pb2 +from tensorflow.python.client import session +from tensorflow.python.eager import test +from tensorflow.python.framework import test_util + + +class MultiWorkerTestBase(test.TestCase): + """Base class for testing multi node strategy and dataset.""" + + @classmethod + def setUpClass(cls): + """Create a local cluster with 2 workers.""" + num_workers = 2 + # Leave some memory for cuda runtime. 
+    gpu_mem_frac = 0.7 / num_workers
+    default_config = config_pb2.ConfigProto()
+    default_config.gpu_options.per_process_gpu_memory_fraction = gpu_mem_frac
+
+    # The local cluster takes some portion of the local GPUs and there is no way
+    # for the cluster to terminate unless using multiple processes. Therefore, we
+    # have to create only one cluster throughout a test process.
+    workers, _ = test_util.create_local_cluster(
+        num_workers, num_ps=0, worker_config=default_config)
+    cls._master_target = workers[0].target
+
+  @contextlib.contextmanager
+  def test_session(self, graph=None, config=None):
+    """Create a test session with master target set to the testing cluster.
+
+    This overrides the base class's method, removes arguments that are not
+    needed by the multi-node case, and creates a test session that connects to
+    the local testing cluster.
+
+    Args:
+      graph: Optional graph to use during the returned session.
+      config: An optional config_pb2.ConfigProto to use to configure the
+        session.
+
+    Yields:
+      A Session object that should be used as a context manager to surround
+      the graph building and execution code in a test case.
+    """
+    if self.id().endswith('.test_session'):
+      self.skipTest('Not a test.')
+
+    if config is None:
+      config = config_pb2.ConfigProto(allow_soft_placement=True)
+    else:
+      config = copy.deepcopy(config)
+    # Don't perform optimizations for tests so we don't inadvertently run
+    # gpu ops on cpu
+    config.graph_options.optimizer_options.opt_level = -1
+    config.graph_options.rewrite_options.constant_folding = (
+        rewriter_config_pb2.RewriterConfig.OFF)
+
+    if graph is None:
+      if self._cached_session is None:  # pylint: disable=access-member-before-definition
+        self._cached_session = session.Session(
+            graph=None, config=config, target=self._master_target)
+      sess = self._cached_session
+      with sess.graph.as_default(), sess.as_default():
+        yield sess
+    else:
+      with session.Session(
+          graph=graph, config=config, target=self._master_target) as sess:
+        yield sess
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 646d2a5c3b3b0b..09b6d4a515ab46 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -36,9 +36,11 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
   # doing something that won't work with other DistributionStrategy
   # implementations?
- def __init__(self, device): + def __init__(self, device, prefetch_on_device=None): super(OneDeviceStrategy, self).__init__() self._device = device + self._prefetch_on_device = prefetch_on_device + self._default_device = device def _create_variable(self, next_creator, *args, **kwargs): # No need to distinguish tower-local variables when not mirroring, @@ -61,7 +63,9 @@ def _create_variable(self, next_creator, *args, **kwargs): return next_creator(*args, **kwargs) def distribute_dataset(self, dataset_fn): - return self._call_dataset_fn(dataset_fn) + return values.PerDeviceDataset( + self._call_dataset_fn(dataset_fn), [self._device], + self._prefetch_on_device) def _broadcast(self, tensor, destinations): return tensor diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py index 7101ed0756f44b..7aad8a953cbedd 100644 --- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py +++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py @@ -24,7 +24,6 @@ from tensorflow.python.framework import test_util -@test_util.with_c_api class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase): def _get_distribution_strategy(self): diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py index 713494d603b855..a0b452fc2d445d 100644 --- a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py +++ b/tensorflow/contrib/distribute/python/shared_variable_creator_test.py @@ -44,7 +44,6 @@ def testWrongPatterns(self): self.assertEquals("foo_a", self._canonicalize("foo_a")) -@test_util.with_c_api class SharedVariableCreatorTest(test.TestCase): @test_util.run_in_graph_and_eager_modes() diff --git a/tensorflow/contrib/distribute/python/single_loss_example.py b/tensorflow/contrib/distribute/python/single_loss_example.py index 0db0b59fcacee2..d1fdb3279cf2a7 100644 --- a/tensorflow/contrib/distribute/python/single_loss_example.py +++ b/tensorflow/contrib/distribute/python/single_loss_example.py @@ -22,6 +22,7 @@ from tensorflow.contrib.distribute.python import step_fn from tensorflow.python.data.ops import dataset_ops from tensorflow.python.framework import constant_op +from tensorflow.python.framework import ops from tensorflow.python.layers import core from tensorflow.python.layers import normalization from tensorflow.python.ops import array_ops @@ -59,7 +60,7 @@ def dataset_fn(): # TODO(isaprykin): map_and_batch with drop_remainder causes shapes to be # fully defined for TPU. Remove this when XLA supports dynamic shapes. return dataset.apply( - batching.map_and_batch(lambda x: x, batch_size=2, drop_remainder=True)) + batching.map_and_batch(lambda x: x, batch_size=1, drop_remainder=True)) # An Optimizer instance is created either outside or inside model_fn. outer_optimizer = None @@ -68,11 +69,10 @@ def dataset_fn(): layer = core.Dense(1, use_bias=use_bias) - def model_fn(xs): + def model_fn(x): """A very simple model written by the user.""" def loss_fn(): - x = math_ops.reduce_mean(xs, keepdims=True) y = array_ops.reshape(layer(x), []) - constant_op.constant(1.) 
return y * y @@ -89,7 +89,8 @@ def loss_fn(): def batchnorm_example(optimizer_fn, batch_per_epoch=1, momentum=0.9, - renorm=False): + renorm=False, + update_ops_in_tower_mode=False): """Example of non-distribution-aware legacy code with batch normalization.""" def dataset_fn(): @@ -103,12 +104,19 @@ def dataset_fn(): optimizer = optimizer_fn() batchnorm = normalization.BatchNormalization( renorm=renorm, momentum=momentum, fused=False) + layer = core.Dense(1, use_bias=False) def model_fn(x): + """A model that uses batchnorm.""" def loss_fn(): - y = math_ops.reduce_sum(batchnorm(x, training=True), axis=1) - loss = math_ops.reduce_mean(y - constant_op.constant(1.)) + y = batchnorm(x, training=True) + with ops.control_dependencies( + ops.get_collection(ops.GraphKeys.UPDATE_OPS) + if update_ops_in_tower_mode else []): + loss = math_ops.reduce_mean( + math_ops.reduce_sum(layer(y)) - constant_op.constant(1.)) + # `x` and `y` will be fetched by the gradient computation, but not `loss`. return loss # Callable loss. diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py index a7e4fe80f3e659..75441786a615fc 100644 --- a/tensorflow/contrib/distribute/python/tpu_strategy.py +++ b/tensorflow/contrib/distribute/python/tpu_strategy.py @@ -33,7 +33,6 @@ from tensorflow.python.util import nest -# TODO(isaprykin): Consider whether inheriting is really appropriate. class TPUStrategy(one_device_strategy.OneDeviceStrategy): """Experimental TPU distribution strategy implementation.""" @@ -73,7 +72,6 @@ def _call_for_each_tower(self, fn, *args, **kwargs): def infeed_input(i): """Get input, split it and then enqueue.""" iteration_inputs = [f.get(i) for f in feeds()] - infeed_inputs = [[inputs_per_core[core_id] for inputs_per_core in iteration_inputs] for core_id in range(self._num_cores_per_host)] @@ -117,3 +115,14 @@ def iterate_on_tpu(): iterate_on_tpu, [], num_shards=self._num_cores_per_host) return control_flow_ops.group(tpu_result, enqueue_ops) + + def _reduce(self, method_string, value, destinations): + del destinations # TPU is graph mode only. Rely on implicit Send/Recv. + if method_string == 'mean': + # TODO(jhseu): Revisit once we support model-parallelism. + value *= (1. 
/ self._num_cores_per_host) + return tpu_ops.cross_replica_sum(value) + + @property + def num_towers(self): + return self._num_cores_per_host diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py index 8cb5276579f48f..9572ade8e497fa 100644 --- a/tensorflow/contrib/distribute/python/values.py +++ b/tensorflow/contrib/distribute/python/values.py @@ -27,15 +27,18 @@ import six from tensorflow.contrib.data.python.ops import batching +from tensorflow.contrib.distribute.python import input_ops from tensorflow.contrib.distribute.python import prefetching_ops_v2 from tensorflow.python.eager import context +from tensorflow.python.framework import device as tf_device from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops -from tensorflow.python.training import checkpointable +from tensorflow.python.ops import math_ops from tensorflow.python.training import device_util from tensorflow.python.training import distribute as distribute_lib from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import base as checkpointable from tensorflow.python.util import nest @@ -58,13 +61,14 @@ def get(self, device=None): else: device = distribute_lib.get_update_device() if device is None: - device = device_util.current() + return self._get_cross_tower() device = device_util.canonicalize(device) try: return self._index[device] - except KeyError: - raise ValueError("Device %s not found in %s (current device %s)" % - (device, self._index.keys(), device_util.current())) + except KeyError as e: + six.raise_from( + ValueError("Device %s not found in %s (current device %s)" % + (device, self._index.keys(), device_util.current())), e) def on_device(self, device): device = device_util.canonicalize(device) @@ -312,6 +316,18 @@ def assign_add(self, *args, **kwargs): def assign(self, *args, **kwargs): return self.get(device=_get_update_device()).assign(*args, **kwargs) + def _get_cross_tower(self): + device = device_util.canonicalize(device_util.current()) + if device in self._index: + return array_ops.identity(self._index[device]) + return array_ops.identity(self._primary_var) + + def _as_graph_element(self): + # pylint: disable=protected-access + if distribute_lib.get_cross_tower_context(): + return self._primary_var._as_graph_element() + return self.get()._as_graph_element() + def _gather_saveables_for_checkpoint(self): """Overrides CheckpointableBase method. 
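The TPU `_reduce` above and `TowerLocalVariable._get_cross_tower` in the next hunk follow the same pattern: a `mean` across towers is just a scaled sum, so a single sum primitive (`tpu_ops.cross_replica_sum` on TPU, `math_ops.add_n` for tower-local variables) is all that is needed. A minimal standalone sketch of that pattern, for illustration only and not part of this patch:

```python
# Illustrative sketch of the scaled-sum reduction used by TPUStrategy._reduce
# and TowerLocalVariable._get_cross_tower; plain Python sum() stands in for
# math_ops.add_n / tpu_ops.cross_replica_sum.
def reduce_per_tower(method_string, per_tower_values):
  total = sum(per_tower_values)
  if method_string == "mean":
    # Scaling the sum by 1/num_towers yields the mean without a separate op.
    return total * (1. / len(per_tower_values))
  return total

assert reduce_per_tower("sum", [2., 4.]) == 6.
assert reduce_per_tower("mean", [2., 4.]) == 3.
```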
@@ -356,6 +372,12 @@ def restore(self, restored_tensors, restored_shapes): for d, v in six.iteritems(self._tower_local_variable._index)]) # pylint: disable=protected-access +def _assert_tower_context(): + if not distribute_lib.get_tower_context(): + raise RuntimeError( + "Tower-local variables may only be assigned in a tower context.") + + class TowerLocalVariable(DistributedVariable, PerDevice, checkpointable.CheckpointableBase): """Holds a map from device to variables whose values are reduced on save.""" @@ -366,18 +388,35 @@ def __init__(self, index, primary_var, reduce_method): super(TowerLocalVariable, self).__init__(index) def assign_sub(self, *args, **kwargs): + _assert_tower_context() return self.get().assign_sub(*args, **kwargs) def assign_add(self, *args, **kwargs): + _assert_tower_context() return self.get().assign_add(*args, **kwargs) def assign(self, *args, **kwargs): + _assert_tower_context() return self.get().assign(*args, **kwargs) @property def reduce_method(self): return self._reduce_method + def _get_cross_tower(self): + all_components = tuple(self._index.values()) + # TODO(josh11b): Use a strategy-specific method. + total = math_ops.add_n(all_components) + if self._reduce_method == "mean": + return total * (1./ len(all_components)) + return total + + def _as_graph_element(self): + # pylint: disable=protected-access + if distribute_lib.get_cross_tower_context(): + return self._get_cross_tower() + return self.get()._as_graph_element() + def _gather_saveables_for_checkpoint(self): """Overrides CheckpointableBase method. @@ -570,11 +609,106 @@ def make_initializable_iterator(self): dataset_iterator, self._devices, self._prefetch_on_device) -class PerIteration(object): - """Holds input for multiple iterations at once.""" +class MultiWorkerDataIterator(object): + """An iterator (like `tf.data.Iterator`) into a `MultiWorkerDataset`.""" - def __init__(self, index): - self._index = index + def __init__(self, iterators, worker_device_map): + """Initialize the MultiWorkerDataIterator object. + + Args: + iterators: a dict mapping from each worker to an iterator for + that worker. + worker_device_map: a dict mapping from each worker's devices to a list of + devices that belong to this worker. + + Raises: + ValueError: if iterators and worker_device_map are not compatible. + """ + self._iterators = iterators + self._worker_device_map = worker_device_map + if set(self._iterators) != set(self._worker_device_map): + raise ValueError("iterators and worker_device_map are not compatible.") + + @property + def initializer(self): + return control_flow_ops.group( + [iterator.initializer for iterator in self._iterators.values()]) + + def get_next(self, name=None): + """Scatter the input across hosts and devices.""" + index = {} + for worker, iterator in six.iteritems(self._iterators): + if name is not None: + d = tf_device.DeviceSpec.from_string(worker) + new_name = "%s_%s_%d" % (name, d.job, d.task) + else: + new_name = None + with ops.device(worker): + data_per_worker = iterator.get_next(name=new_name) + + worker_devices = self._worker_device_map[worker] + # Ungroup these per-device value so as to get a flat map from devices to + # values. + for d in worker_devices: + v = select_device(d, data_per_worker) + if d in index: + raise ValueError("Duplicated devices in worker_device_map: %r" % v) + index[d] = v + + return regroup(index) + + +class MultiWorkerDataset(object): + """Like a `tf.data.Dataset` that distributes data to different workers. 
+ + Each worker gets one shard of the input dataset. It is currently not working + in + eager mode. + """ + + def __init__(self, dataset_fn, worker_device_map, prefetch_on_device=None): + """Initialize the MultiWorkerDataset object. + + Args: + dataset_fn: a function that returns a `tf.data.Dataset`. + worker_device_map: a dict mapping from each worker to a list of devices + that belong to this worker. + prefetch_on_device: whether to prefetch to devices. + """ + self._worker_device_map = worker_device_map + self._datasets = {} + # TODO(yuefengz, priyag): support different set of jobs for input + # processing. + for i, (worker, worker_devices) in enumerate( + six.iteritems(worker_device_map)): + with ops.device(worker): + worker_input = dataset_fn() + worker_input = input_ops.auto_shard_dataset( + worker_input, len(worker_device_map), i) + self._datasets[worker] = PerDeviceDataset( + worker_input, worker_devices, prefetch_on_device=prefetch_on_device) + + def make_one_shot_iterator(self): + iterators = {} + for worker, dataset in six.iteritems(self._datasets): + with ops.device(worker): + iterators[worker] = dataset.make_one_shot_iterator() + return MultiWorkerDataIterator(iterators, self._worker_device_map) + + def make_initializable_iterator(self): + iterators = {} + for worker, dataset in six.iteritems(self._datasets): + with ops.device(worker): + iterators[worker] = dataset.make_initializable_iterator() + return MultiWorkerDataIterator(iterators, self._worker_device_map) + + +class _PerKey(object): + """Holds data associated by keys.""" + + def __init__(self, *index): + # pylint: disable=protected-access + self._index = list(index) def get(self, iteration): return array_ops.gather(self._index, iteration) @@ -585,6 +719,24 @@ def get_shape(self): def get_dtype(self): return self._index[-1][-1].dtype + def __str__(self): + return "%s:%s" % (self.__class__.__name__, self._index) + + def __repr__(self): + return "%s(%r)" % (self.__class__.__name__, self._index) + + +class PerIteration(_PerKey): + """Holds input for multiple iterations at once.""" + + def __init__(self, *index): + # pylint: disable=protected-access + super(PerIteration, self).__init__(*[batch._index for batch in index]) + + +class Batches(_PerKey): + pass + class MultiIterator(object): """Iterator that returns results of multiple get_next()s.""" @@ -595,11 +747,31 @@ def __init__(self, dataset_iterator, iterations, batches_per_iteration): self._batches_per_iteration = batches_per_iteration def get_next(self, name=None): - return PerIteration([[ - self._dataset_iterator.get_next(name=name) - for _ in range(self._batches_per_iteration) - ] - for _ in range(self._iterations)]) + """Return PerIteration with `iterations x batches_per_iteration` inputs.""" + data = [] + for _ in range(self._batches_per_iteration): + batch = [] + for _ in range(self._iterations): + batch.append(self._dataset_iterator.get_next(name=name)) + data.append(batch) + + # Here is an example. Suppose each get_next returns a tuple of two tensors. 
+ # For 3 `iterations` and 2 `batches_per_iteration`, the `data` is: + # [[(a,z), (b,y), (c,x)], [(A,Z), (B,Y), (C,X)]] + # + # After the first `map_structure` it gets transformed to: + # [(Batches(a, A), Batches(z, Z)), + # (Batches(b, B), Batches(y, Y)), + # (Batches(c, C), Batches(x, X))] + # + # After the second `map_structure` it gets transformed to a tuple of: + # (PerIteration([Batches(a, A), Batches(b, B), Batches(c, C)]), + # PerIteration([Batches(z, Z), Batches(y, Y), Batches(x, X)])) + + data = nest.map_structure(Batches, *data) + data = nest.map_structure(PerIteration, *data) + + return data @property def initializer(self): diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py index e96ce547415fcb..1c95758d96aba4 100644 --- a/tensorflow/contrib/distribute/python/values_test.py +++ b/tensorflow/contrib/distribute/python/values_test.py @@ -18,9 +18,11 @@ from __future__ import division from __future__ import print_function +import collections import os from tensorflow.contrib.distribute.python import mirrored_strategy +from tensorflow.contrib.distribute.python import multi_worker_test_base from tensorflow.contrib.distribute.python import values from tensorflow.core.protobuf import config_pb2 from tensorflow.python.data.ops import dataset_ops @@ -34,11 +36,12 @@ from tensorflow.python.ops import array_ops from tensorflow.python.ops import random_ops from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables as variables_lib from tensorflow.python.training import device_util from tensorflow.python.training import saver as saver_lib +from tensorflow.python.util import nest -@test_util.with_c_api class DistributedValuesTest(test.TestCase): def testGetEager(self): @@ -77,7 +80,6 @@ def testCanonicalization(self): v = values.DistributedValues({"/device:cpu:0": 42}) -@test_util.with_c_api class DistributedDelegateTest(test.TestCase): @test_util.run_in_graph_and_eager_modes() @@ -160,7 +162,6 @@ def _make_mirrored(): return v, devices, mirrored -@test_util.with_c_api class RegroupAndSelectDeviceTest(test.TestCase): def _is_per_device(self, result, expected, klass=values.PerDevice): @@ -313,7 +314,6 @@ def testNamedTupleEstimatorSpec(self): merged_estimator_spec)) -@test_util.with_c_api class PerDeviceDatasetTest(test.TestCase): config = config_pb2.ConfigProto() @@ -436,7 +436,130 @@ def testInitializableIterator(self): self.evaluate(next_element) -@test_util.with_c_api +class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase): + + def _test_iterator(self, iterator, devices, expected_values): + next_element = iterator.get_next() + for device in devices: + v = values.select_device(device, next_element) + # The `v` here can be a tuple. 
+ for element in nest.flatten(v): + self.assertTrue(element.device in device) + + for expected_value in expected_values: + actual = self.evaluate( + [values.select_device(d, next_element) for d in devices]) + self.assertEqual(expected_value, actual) + + with self.assertRaises(errors.OutOfRangeError): + self.evaluate([values.select_device(d, next_element) for d in devices]) + + def _test_dataset(self, dataset_fn, worker_device_map, devices, + expected_values): + multi_worker_dataset = values.MultiWorkerDataset( + dataset_fn, worker_device_map, prefetch_on_device=False) + multi_worker_iterator = multi_worker_dataset.make_one_shot_iterator() + self._test_iterator(multi_worker_iterator, devices, expected_values) + + def _cpu_devices(self): + worker_device_map = collections.OrderedDict( + [("/job:worker/replica:0/task:0", + ["/job:worker/replica:0/task:0/device:CPU:0"]), + ("/job:worker/replica:0/task:1", + ["/job:worker/replica:0/task:1/device:CPU:0"])]) + devices = [ + "/job:worker/replica:0/task:0/device:CPU:0", + "/job:worker/replica:0/task:1/device:CPU:0" + ] + return worker_device_map, devices + + def _cpu_and_one_gpu_devices(self): + # The worker_device_map doesn't have to be a OrderDict object, this is just + # to simplify the testing so that we can pass expected values as a list + # instead of a dict. + worker_device_map = collections.OrderedDict( + [("/job:worker/replica:0/task:0", [ + "/job:worker/replica:0/task:0/device:GPU:0", + "/job:worker/replica:0/task:0/device:CPU:0" + ]), ("/job:worker/replica:0/task:1", [ + "/job:worker/replica:0/task:1/device:GPU:0", + "/job:worker/replica:0/task:1/device:CPU:0" + ])]) + devices = [ + "/job:worker/replica:0/task:0/device:GPU:0", + "/job:worker/replica:0/task:0/device:CPU:0", + "/job:worker/replica:0/task:1/device:GPU:0", + "/job:worker/replica:0/task:1/device:CPU:0" + ] + return worker_device_map, devices + + def testDataDistributionOneDevicePerWorker(self): + worker_device_map, devices = self._cpu_devices() + with context.graph_mode(): + dataset_fn = lambda: dataset_ops.Dataset.range(8) + self._test_dataset(dataset_fn, worker_device_map, devices, + [[0, 1], [2, 3], [4, 5], [6, 7]]) + + def testDataDistributionTwoDevicePerWorker(self): + if context.num_gpus() < 1: + self.skipTest("A GPU is not available for this test.") + worker_device_map, devices = self._cpu_and_one_gpu_devices() + with context.graph_mode(): + dataset_fn = lambda: dataset_ops.Dataset.range(8) + self._test_dataset(dataset_fn, worker_device_map, devices, + [[0, 2, 1, 3], [4, 6, 5, 7]]) + + def testTupleDataset(self): + worker_device_map, devices = self._cpu_devices() + + with context.graph_mode(): + + def dataset_fn(): + dataset1 = dataset_ops.Dataset.range(8) + dataset2 = dataset_ops.Dataset.range(8).map(lambda x: x**2) + return dataset_ops.Dataset.zip((dataset1, dataset2)) + + expected_values = [ + [(i, i**2), (i + 1, (i + 1)**2)] for i in range(0, 8, 2) + ] + self._test_dataset(dataset_fn, worker_device_map, devices, + expected_values) + + def testInitializableIterator(self): + worker_device_map, devices = self._cpu_devices() + with context.graph_mode(): + dataset_fn = lambda: dataset_ops.Dataset.range(8) + multi_worker_dataset = values.MultiWorkerDataset( + dataset_fn, worker_device_map, prefetch_on_device=False) + multi_worker_iterator = multi_worker_dataset.make_initializable_iterator() + + self.evaluate(multi_worker_iterator.initializer) + self._test_iterator(multi_worker_iterator, devices, + [[0, 1], [2, 3], [4, 5], [6, 7]]) + + # After re-initializing the iterator, 
should be able to iterate again. + self.evaluate(multi_worker_iterator.initializer) + self._test_iterator(multi_worker_iterator, devices, + [[0, 1], [2, 3], [4, 5], [6, 7]]) + + def testValueErrorForIterator(self): + # Incompatiable arguments. + with self.assertRaises(ValueError): + values.MultiWorkerDataIterator({"w1": None}, {"w1": "d1", "w2": "d2"}) + + # Test duplicated devices under same worker. + worker_device_map, _ = self._cpu_devices() + worker_device_map["/job:worker/replica:0/task:0"].append( + "/job:worker/replica:0/task:0/device:CPU:0") + with context.graph_mode(): + dataset_fn = lambda: dataset_ops.Dataset.range(8) + multi_worker_dataset = values.MultiWorkerDataset( + dataset_fn, worker_device_map, prefetch_on_device=False) + multi_worker_iterator = multi_worker_dataset.make_initializable_iterator() + with self.assertRaises(ValueError): + multi_worker_iterator.get_next() + + class MirroredVariableTest(test.TestCase): config = config_pb2.ConfigProto() @@ -582,6 +705,21 @@ def testSaveNormalRestoreMirrored(self): save_path = self._save_normal() self._restore_mirrored(save_path) + @test_util.run_in_graph_and_eager_modes(config=config) + def testFetchAMirroredVariable(self): + if context.num_gpus() < 1 or context.executing_eagerly(): + self.skipTest("A GPU is not available for this test or it's eager mode.") + + with self.test_session( + graph=ops.Graph()) as sess, mirrored_strategy.MirroredStrategy( + ["/device:GPU:0"]).scope(): + with ops.device("/device:GPU:0"): + v = variable_scope.get_variable( + name="v", initializer=1., use_resource=True) + mirrored = values.MirroredVariable({"/device:GPU:0": v}, v) + sess.run(variables_lib.global_variables_initializer()) + sess.run({"complicated": mirrored}) + _devices = ["/device:GPU:0", "/device:CPU:0"] @@ -598,7 +736,6 @@ def _make_tower_local(method): return v, tower_local -@test_util.with_c_api class TowerLocalVariableTest(test.TestCase): config = config_pb2.ConfigProto() diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD index 9a3b02cd813fa8..cbce7a88fd1cff 100644 --- a/tensorflow/contrib/distributions/BUILD +++ b/tensorflow/contrib/distributions/BUILD @@ -94,7 +94,7 @@ gpu_py_test( gpu_py_test( name = "distribution_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/distribution_test.py"], additional_deps = [ ":distributions_py", @@ -337,7 +337,7 @@ gpu_py_test( gpu_py_test( name = "mvn_tril_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/mvn_tril_test.py"], additional_deps = [ ":distributions_py", @@ -372,6 +372,7 @@ gpu_py_test( "//tensorflow/python:random_ops", "//tensorflow/python:variables", ], + shard_count = 4, ) gpu_py_test( @@ -459,7 +460,7 @@ gpu_py_test( gpu_py_test( name = "batch_reshape_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/batch_reshape_test.py"], additional_deps = [ ":distributions_py", @@ -578,7 +579,7 @@ gpu_py_test( gpu_py_test( name = "wishart_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/wishart_test.py"], additional_deps = [ ":distributions_py", @@ -709,6 +710,8 @@ gpu_py_test( "//tensorflow/contrib/linalg:linalg_py", "//tensorflow/python:client_testlib", ], + shard_count = 4, + tags = ["noasan"], # times out, http://b/78588814 ) gpu_py_test( @@ -865,7 +868,7 @@ gpu_py_test( gpu_py_test( name = "batch_normalization_test", - size = "small", + size = "medium", srcs = ["python/kernel_tests/bijectors/batch_normalization_test.py"], additional_deps = [ ":bijectors_py", 
@@ -1029,6 +1032,25 @@ gpu_py_test( ], ) +gpu_py_test( + name = "matrix_inverse_tril_test", + size = "medium", + srcs = ["python/kernel_tests/bijectors/matrix_inverse_tril_test.py"], + additional_deps = [ + ":bijectors_py", + ":distributions_py", + "//third_party/py/numpy", + "@six_archive//:six", + "//tensorflow/contrib/linalg:linalg_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:framework_test_lib", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform_test", + ], +) + gpu_py_test( name = "real_nvp_test", size = "small", diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py index ddf59891e626a8..802538ba97578c 100644 --- a/tensorflow/contrib/distributions/__init__.py +++ b/tensorflow/contrib/distributions/__init__.py @@ -32,6 +32,7 @@ from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import * from tensorflow.contrib.distributions.python.ops.deterministic import * from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular +from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular_inverse from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform from tensorflow.contrib.distributions.python.ops.distribution_util import reduce_weighted_logsumexp from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse @@ -156,6 +157,7 @@ 'kl_divergence', 'RegisterKL', 'fill_triangular', + 'fill_triangular_inverse', 'matrix_diag_transform', 'reduce_weighted_logsumexp', 'softplus_inverse', diff --git a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py index 59d549b7b80a3d..f2bb2d3325a7cc 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/batch_reshape_test.py @@ -448,8 +448,7 @@ def test_bad_reshape_size(self): else: with self.test_session(): - with self.assertRaisesOpError(r"`batch_shape` size must match " - r"`distributions.batch_shape` size"): + with self.assertRaisesOpError(r"Shape sizes do not match."): batch_reshape_lib.BatchReshape( distribution=mvn, batch_shape=new_batch_shape_ph, @@ -457,8 +456,13 @@ def test_bad_reshape_size(self): def test_non_positive_shape(self): dims = 2 - new_batch_shape = [-1, -2] # -1*-2=2 so will pass size check. - old_batch_shape = [2] + old_batch_shape = [4] + if self.is_static_shape: + # Unknown first dimension does not trigger size check. Note that + # any dimension < 0 is treated statically as unknown. + new_batch_shape = [-1, 0] + else: + new_batch_shape = [-2, -2] # -2 * -2 = 4, same size as the old shape. 
new_batch_shape_ph = ( constant_op.constant(np.int32(new_batch_shape)) if self.is_static_shape @@ -471,7 +475,7 @@ def test_non_positive_shape(self): mvn = mvn_lib.MultivariateNormalDiag(scale_diag=scale_ph) if self.is_static_shape: - with self.assertRaisesRegexp(ValueError, r".*must be positive.*"): + with self.assertRaisesRegexp(ValueError, r".*must be >=-1.*"): batch_reshape_lib.BatchReshape( distribution=mvn, batch_shape=new_batch_shape_ph, @@ -479,7 +483,7 @@ def test_non_positive_shape(self): else: with self.test_session(): - with self.assertRaisesOpError(r".*must be positive.*"): + with self.assertRaisesOpError(r".*must be >=-1.*"): batch_reshape_lib.BatchReshape( distribution=mvn, batch_shape=new_batch_shape_ph, diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py index ca20442c394066..dc45114b1c23b5 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/chain_test.py @@ -26,6 +26,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import SoftmaxCentered from tensorflow.contrib.distributions.python.ops.bijectors.softplus import Softplus from tensorflow.python.framework import tensor_shape +from tensorflow.python.ops import array_ops from tensorflow.python.ops.distributions import bijector from tensorflow.python.ops.distributions.bijector_test_util import assert_scalar_congruency from tensorflow.python.platform import test @@ -188,6 +189,15 @@ def testChainAffineExp(self): -np.log(6, dtype=np.float32) - np.sum(x), self.evaluate(chain.inverse_log_det_jacobian(y, event_ndims=1))) + def testChainIldjWithPlaceholder(self): + chain = Chain((Exp(), Exp())) + samples = array_ops.placeholder( + dtype=np.float32, shape=[None, 10], name="samples") + ildj = chain.inverse_log_det_jacobian(samples, event_ndims=0) + self.assertTrue(ildj is not None) + with self.test_session(): + ildj.eval({samples: np.zeros([2, 10], np.float32)}) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py index 8b279ebcd908b6..f8a52615b0f3f5 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/conditional_bijector_test.py @@ -59,7 +59,7 @@ def testConditionalBijector(self): for name in ["inverse_log_det_jacobian", "forward_log_det_jacobian"]: method = getattr(b, name) with self.assertRaisesRegexp(ValueError, name + ".*b1.*b2"): - method(1., event_ndims=0., arg1="b1", arg2="b2") + method(1., event_ndims=0, arg1="b1", arg2="b2") if __name__ == "__main__": diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py new file mode 100644 index 00000000000000..18397035571561 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py @@ -0,0 +1,190 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for MatrixInverseTriL bijector.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np + +from tensorflow.contrib.distributions.python.ops import bijectors +from tensorflow.python.framework import errors +from tensorflow.python.framework import test_util +from tensorflow.python.platform import test + + +class MatrixInverseTriLBijectorTest(test.TestCase): + """Tests the correctness of the Y = inv(tril) transformation.""" + + @test_util.run_in_graph_and_eager_modes() + def testComputesCorrectValues(self): + inv = bijectors.MatrixInverseTriL(validate_args=True) + self.assertEqual("matrix_inverse_tril", inv.name) + x_ = np.array([[0.7, 0., 0.], + [0.1, -1., 0.], + [0.3, 0.25, 0.5]], dtype=np.float32) + x_inv_ = np.linalg.inv(x_) + expected_fldj_ = -6. * np.sum(np.log(np.abs(np.diag(x_)))) + + y = inv.forward(x_) + x_back = inv.inverse(x_inv_) + fldj = inv.forward_log_det_jacobian(x_, event_ndims=2) + ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2) + + y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj]) + + self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5) + self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5) + self.assertNear(expected_fldj_, fldj_, err=1e-3) + self.assertNear(-expected_fldj_, ildj_, err=1e-3) + + @test_util.run_in_graph_and_eager_modes() + def testOneByOneMatrix(self): + inv = bijectors.MatrixInverseTriL(validate_args=True) + x_ = np.array([[5.]], dtype=np.float32) + x_inv_ = np.array([[0.2]], dtype=np.float32) + expected_fldj_ = np.log(0.04) + + y = inv.forward(x_) + x_back = inv.inverse(x_inv_) + fldj = inv.forward_log_det_jacobian(x_, event_ndims=2) + ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2) + + y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj]) + + self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5) + self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5) + self.assertNear(expected_fldj_, fldj_, err=1e-3) + self.assertNear(-expected_fldj_, ildj_, err=1e-3) + + @test_util.run_in_graph_and_eager_modes() + def testZeroByZeroMatrix(self): + inv = bijectors.MatrixInverseTriL(validate_args=True) + x_ = np.eye(0, dtype=np.float32) + x_inv_ = np.eye(0, dtype=np.float32) + expected_fldj_ = 0. + + y = inv.forward(x_) + x_back = inv.inverse(x_inv_) + fldj = inv.forward_log_det_jacobian(x_, event_ndims=2) + ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2) + + y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj]) + + self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5) + self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5) + self.assertNear(expected_fldj_, fldj_, err=1e-3) + self.assertNear(-expected_fldj_, ildj_, err=1e-3) + + @test_util.run_in_graph_and_eager_modes() + def testBatch(self): + # Test batch computation with input shape (2, 1, 2, 2), i.e. batch shape + # (2, 1). 
+ inv = bijectors.MatrixInverseTriL(validate_args=True) + x_ = np.array([[[[1., 0.], + [2., 3.]]], + [[[4., 0.], + [5., -6.]]]], dtype=np.float32) + x_inv_ = np.linalg.inv(x_) + expected_fldj_ = -4. * np.sum( + np.log(np.abs(np.diagonal(x_, axis1=-2, axis2=-1))), axis=-1) + + y = inv.forward(x_) + x_back = inv.inverse(x_inv_) + fldj = inv.forward_log_det_jacobian(x_, event_ndims=2) + ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2) + + y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj]) + + self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5) + self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5) + self.assertAllClose(expected_fldj_, fldj_, atol=0., rtol=1e-3) + self.assertAllClose(-expected_fldj_, ildj_, atol=0., rtol=1e-3) + + @test_util.run_in_graph_and_eager_modes() + def testErrorOnInputRankTooLow(self): + inv = bijectors.MatrixInverseTriL(validate_args=True) + x_ = np.array([0.1], dtype=np.float32) + rank_error_msg = "must have rank at least 2" + with self.test_session(): + with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg): + inv.forward(x_).eval() + with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg): + inv.inverse(x_).eval() + with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg): + inv.forward_log_det_jacobian(x_, event_ndims=2).eval() + with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg): + inv.inverse_log_det_jacobian(x_, event_ndims=2).eval() + + # TODO(b/80481923): Figure out why these assertions fail, and fix them. + ## def testErrorOnInputNonSquare(self): + ## inv = bijectors.MatrixInverseTriL(validate_args=True) + ## x_ = np.array([[1., 2., 3.], + ## [4., 5., 6.]], dtype=np.float32) + ## square_error_msg = "must be a square matrix" + ## with self.test_session(): + ## with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + ## square_error_msg): + ## inv.forward(x_).eval() + ## with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + ## square_error_msg): + ## inv.inverse(x_).eval() + ## with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + ## square_error_msg): + ## inv.forward_log_det_jacobian(x_, event_ndims=2).eval() + ## with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + ## square_error_msg): + ## inv.inverse_log_det_jacobian(x_, event_ndims=2).eval() + + @test_util.run_in_graph_and_eager_modes() + def testErrorOnInputNotLowerTriangular(self): + inv = bijectors.MatrixInverseTriL(validate_args=True) + x_ = np.array([[1., 2.], + [3., 4.]], dtype=np.float32) + triangular_error_msg = "must be lower triangular" + with self.test_session(): + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + triangular_error_msg): + inv.forward(x_).eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + triangular_error_msg): + inv.inverse(x_).eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + triangular_error_msg): + inv.forward_log_det_jacobian(x_, event_ndims=2).eval() + with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError, + triangular_error_msg): + inv.inverse_log_det_jacobian(x_, event_ndims=2).eval() + + @test_util.run_in_graph_and_eager_modes() + def testErrorOnInputSingular(self): + inv = bijectors.MatrixInverseTriL(validate_args=True) + x_ = np.array([[1., 0.], + [0., 0.]], dtype=np.float32) + nonsingular_error_msg = "must have all diagonal entries nonzero" + with self.test_session(): + with 
self.assertRaisesOpError(nonsingular_error_msg): + inv.forward(x_).eval() + with self.assertRaisesOpError(nonsingular_error_msg): + inv.inverse(x_).eval() + with self.assertRaisesOpError(nonsingular_error_msg): + inv.forward_log_det_jacobian(x_, event_ndims=2).eval() + with self.assertRaisesOpError(nonsingular_error_msg): + inv.inverse_log_det_jacobian(x_, event_ndims=2).eval() + + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py index 46f2c63f9b0f78..d44e49b4874a5b 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/reshape_test.py @@ -22,15 +22,12 @@ from tensorflow.contrib.distributions.python.ops.bijectors.reshape import Reshape from tensorflow.python.framework import dtypes -from tensorflow.python.framework import ops from tensorflow.python.framework import tensor_shape -from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops.distributions.bijector_test_util import assert_bijective_and_finite from tensorflow.python.platform import test -@test_util.with_c_api class _ReshapeBijectorTest(object): """Base class for testing the reshape transformation. @@ -265,7 +262,6 @@ def build_shapes(self, *args, **kwargs): raise NotImplementedError("Subclass failed to implement `build_shapes`.") -@test_util.with_c_api class ReshapeBijectorTestStatic(test.TestCase, _ReshapeBijectorTest): def build_shapes(self, shape_in, shape_out): @@ -305,21 +301,13 @@ def testBijectiveAndFinite(self): bijector, x, y, event_ndims=2, rtol=1e-6, atol=0) def testInvalidDimensionsOpError(self): - if ops._USE_C_API: - error_message = "Invalid value in tensor used for shape: -2" - else: - error_message = "elements must be either positive integers or `-1`." 
- self._testInvalidDimensionsOpError(error_message) + self._testInvalidDimensionsOpError( + "Invalid value in tensor used for shape: -2") def testInputOutputMismatchOpError(self): - if ops._USE_C_API: - error_message = "Cannot reshape a tensor with" - else: - error_message = "Input to reshape is a tensor with" - self._testInputOutputMismatchOpError(error_message) + self._testInputOutputMismatchOpError("Cannot reshape a tensor with") -@test_util.with_c_api class ReshapeBijectorTestDynamic(test.TestCase, _ReshapeBijectorTest): def build_shapes(self, shape_in, shape_out): @@ -341,7 +329,6 @@ def testInputOutputMismatchOpError(self): self._testInputOutputMismatchOpError("Input to reshape is a tensor with") -@test_util.with_c_api class ReshapeBijectorTestDynamicNdims(test.TestCase, _ReshapeBijectorTest): def build_shapes(self, shape_in, shape_out): diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py index 7435bcbc684c16..b003526392709b 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_full_covariance_test.py @@ -131,8 +131,8 @@ def _random_mu_and_sigma(self, batch_shape, event_shape): return mu, sigma def testKLBatch(self): - batch_shape = (2,) - event_shape = (3,) + batch_shape = [2] + event_shape = [3] with self.test_session(): mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape) @@ -156,6 +156,33 @@ def testKLBatch(self): self.assertAllClose(expected_kl_0, kl_v[0]) self.assertAllClose(expected_kl_1, kl_v[1]) + def testKLBatchBroadcast(self): + batch_shape = [2] + event_shape = [3] + with self.test_session(): + mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) + # No batch shape. 
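+      # mu_b and sigma_b describe a single distribution; kl_divergence should
+      # broadcast it against both batch members of mvn_a.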
+ mu_b, sigma_b = self._random_mu_and_sigma([], event_shape) + mvn_a = ds.MultivariateNormalFullCovariance( + loc=mu_a, + covariance_matrix=sigma_a, + validate_args=True) + mvn_b = ds.MultivariateNormalFullCovariance( + loc=mu_b, + covariance_matrix=sigma_b, + validate_args=True) + + kl = ds.kl_divergence(mvn_a, mvn_b) + self.assertEqual(batch_shape, kl.get_shape()) + + kl_v = kl.eval() + expected_kl_0 = _compute_non_batch_kl(mu_a[0, :], sigma_a[0, :, :], + mu_b, sigma_b) + expected_kl_1 = _compute_non_batch_kl(mu_a[1, :], sigma_a[1, :, :], + mu_b, sigma_b) + self.assertAllClose(expected_kl_0, kl_v[0]) + self.assertAllClose(expected_kl_1, kl_v[1]) + def _compute_non_batch_kl(mu_a, sigma_a, mu_b, sigma_b): """Non-batch KL for N(mu_a, sigma_a), N(mu_b, sigma_b).""" diff --git a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py index 685f32883dae5b..b556d06123800f 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/mvn_tril_test.py @@ -235,8 +235,8 @@ def _random_mu_and_sigma(self, batch_shape, event_shape): return mu, sigma def testKLNonBatch(self): - batch_shape = () - event_shape = (2,) + batch_shape = [] + event_shape = [2] with self.test_session(): mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape) @@ -257,8 +257,8 @@ def testKLNonBatch(self): self.assertAllClose(expected_kl, kl_v) def testKLBatch(self): - batch_shape = (2,) - event_shape = (3,) + batch_shape = [2] + event_shape = [3] with self.test_session(): mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) mu_b, sigma_b = self._random_mu_and_sigma(batch_shape, event_shape) @@ -282,9 +282,36 @@ def testKLBatch(self): self.assertAllClose(expected_kl_0, kl_v[0]) self.assertAllClose(expected_kl_1, kl_v[1]) + def testKLBatchBroadcast(self): + batch_shape = [2] + event_shape = [3] + with self.test_session(): + mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) + # No batch shape. 
+ mu_b, sigma_b = self._random_mu_and_sigma([], event_shape) + mvn_a = ds.MultivariateNormalTriL( + loc=mu_a, + scale_tril=np.linalg.cholesky(sigma_a), + validate_args=True) + mvn_b = ds.MultivariateNormalTriL( + loc=mu_b, + scale_tril=np.linalg.cholesky(sigma_b), + validate_args=True) + + kl = ds.kl_divergence(mvn_a, mvn_b) + self.assertEqual(batch_shape, kl.get_shape()) + + kl_v = kl.eval() + expected_kl_0 = _compute_non_batch_kl(mu_a[0, :], sigma_a[0, :, :], + mu_b, sigma_b) + expected_kl_1 = _compute_non_batch_kl(mu_a[1, :], sigma_a[1, :, :], + mu_b, sigma_b) + self.assertAllClose(expected_kl_0, kl_v[0]) + self.assertAllClose(expected_kl_1, kl_v[1]) + def testKLTwoIdenticalDistributionsIsZero(self): - batch_shape = (2,) - event_shape = (3,) + batch_shape = [2] + event_shape = [3] with self.test_session(): mu_a, sigma_a = self._random_mu_and_sigma(batch_shape, event_shape) mvn_a = ds.MultivariateNormalTriL( diff --git a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py index 96805733178705..b91a610acf1a90 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/seed_stream_test.py @@ -65,6 +65,16 @@ def testNestingRobustness(self): self.assertAllUnique( outputs + [strm2() for _ in range(50)] + [strm3() for _ in range(50)]) + def testInitFromOtherSeedStream(self): + strm1 = seed_stream.SeedStream(seed=4, salt="salt") + strm2 = seed_stream.SeedStream(strm1, salt="salt") + strm3 = seed_stream.SeedStream(strm1, salt="another salt") + out1 = [strm1() for _ in range(50)] + out2 = [strm2() for _ in range(50)] + out3 = [strm3() for _ in range(50)] + self.assertAllEqual(out1, out2) + self.assertAllUnique(out1 + out3) + if __name__ == "__main__": test.main() diff --git a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py index ce6cf702d52279..9c4dfed83631e9 100644 --- a/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py +++ b/tensorflow/contrib/distributions/python/kernel_tests/statistical_testing_test.py @@ -98,23 +98,21 @@ def test_true_mean_confidence_interval_by_dkwm_one_sample(self): num_samples = 5000 # 5000 samples is chosen to be enough to find discrepancies of # size 0.1 or more with assurance 1e-6, as confirmed here: - with self.test_session() as sess: - d = st.min_discrepancy_of_true_means_detectable_by_dkwm( - num_samples, 0., 1., false_fail_rate=1e-6, false_pass_rate=1e-6) - d = sess.run(d) - self.assertLess(d, 0.1) + d = st.min_discrepancy_of_true_means_detectable_by_dkwm( + num_samples, 0., 1., false_fail_rate=1e-6, false_pass_rate=1e-6) + d = self.evaluate(d) + self.assertLess(d, 0.1) # Test that the confidence interval computed for the mean includes # 0.5 and excludes 0.4 and 0.6. 
- with self.test_session() as sess: - samples = rng.uniform(size=num_samples).astype(np.float32) - (low, high) = st.true_mean_confidence_interval_by_dkwm( - samples, 0., 1., error_rate=1e-6) - low, high = sess.run([low, high]) - self.assertGreater(low, 0.4) - self.assertLess(low, 0.5) - self.assertGreater(high, 0.5) - self.assertLess(high, 0.6) + samples = rng.uniform(size=num_samples).astype(np.float32) + (low, high) = st.true_mean_confidence_interval_by_dkwm( + samples, 0., 1., error_rate=1e-6) + low, high = self.evaluate([low, high]) + self.assertGreater(low, 0.4) + self.assertLess(low, 0.5) + self.assertGreater(high, 0.5) + self.assertLess(high, 0.6) def test_dkwm_mean_one_sample_assertion(self): rng = np.random.RandomState(seed=0) @@ -123,21 +121,45 @@ def test_dkwm_mean_one_sample_assertion(self): # Test that the test assertion agrees that the mean of the standard # uniform distribution is 0.5. samples = rng.uniform(size=num_samples).astype(np.float32) - with self.test_session() as sess: - sess.run(st.assert_true_mean_equal_by_dkwm( - samples, 0., 1., 0.5, false_fail_rate=1e-6)) - - # Test that the test assertion confirms that the mean of the - # standard uniform distribution is not 0.4. - with self.assertRaisesOpError("Mean confidence interval too high"): - sess.run(st.assert_true_mean_equal_by_dkwm( - samples, 0., 1., 0.4, false_fail_rate=1e-6)) - - # Test that the test assertion confirms that the mean of the - # standard uniform distribution is not 0.6. - with self.assertRaisesOpError("Mean confidence interval too low"): - sess.run(st.assert_true_mean_equal_by_dkwm( - samples, 0., 1., 0.6, false_fail_rate=1e-6)) + self.evaluate(st.assert_true_mean_equal_by_dkwm( + samples, 0., 1., 0.5, false_fail_rate=1e-6)) + + # Test that the test assertion confirms that the mean of the + # standard uniform distribution is not 0.4. + with self.assertRaisesOpError("true mean greater than expected"): + self.evaluate(st.assert_true_mean_equal_by_dkwm( + samples, 0., 1., 0.4, false_fail_rate=1e-6)) + + # Test that the test assertion confirms that the mean of the + # standard uniform distribution is not 0.6. + with self.assertRaisesOpError("true mean smaller than expected"): + self.evaluate(st.assert_true_mean_equal_by_dkwm( + samples, 0., 1., 0.6, false_fail_rate=1e-6)) + + def test_dkwm_mean_in_interval_one_sample_assertion(self): + rng = np.random.RandomState(seed=0) + num_samples = 5000 + + # Test that the test assertion agrees that the mean of the standard + # uniform distribution is between 0.4 and 0.6. + samples = rng.uniform(size=num_samples).astype(np.float32) + self.evaluate(st.assert_true_mean_in_interval_by_dkwm( + samples, 0., 1., + expected_low=0.4, expected_high=0.6, false_fail_rate=1e-6)) + + # Test that the test assertion confirms that the mean of the + # standard uniform distribution is not between 0.2 and 0.4. + with self.assertRaisesOpError("true mean greater than expected"): + self.evaluate(st.assert_true_mean_in_interval_by_dkwm( + samples, 0., 1., + expected_low=0.2, expected_high=0.4, false_fail_rate=1e-6)) + + # Test that the test assertion confirms that the mean of the + # standard uniform distribution is not between 0.6 and 0.8. 
+ with self.assertRaisesOpError("true mean smaller than expected"): + self.evaluate(st.assert_true_mean_in_interval_by_dkwm( + samples, 0., 1., + expected_low=0.6, expected_high=0.8, false_fail_rate=1e-6)) def test_dkwm_mean_two_sample_assertion(self): rng = np.random.RandomState(seed=0) @@ -145,20 +167,18 @@ def test_dkwm_mean_two_sample_assertion(self): # 4000 samples is chosen to be enough to find discrepancies of # size 0.2 or more with assurance 1e-6, as confirmed here: - with self.test_session() as sess: - d = st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample( - num_samples, 0., 1., num_samples, 0., 1., - false_fail_rate=1e-6, false_pass_rate=1e-6) - d = sess.run(d) - self.assertLess(d, 0.2) + d = st.min_discrepancy_of_true_means_detectable_by_dkwm_two_sample( + num_samples, 0., 1., num_samples, 0., 1., + false_fail_rate=1e-6, false_pass_rate=1e-6) + d = self.evaluate(d) + self.assertLess(d, 0.2) # Test that the test assertion agrees that the standard # uniform distribution has the same mean as itself. samples1 = rng.uniform(size=num_samples).astype(np.float32) samples2 = rng.uniform(size=num_samples).astype(np.float32) - with self.test_session() as sess: - sess.run(st.assert_true_mean_equal_by_dkwm_two_sample( - samples1, 0., 1., samples2, 0., 1., false_fail_rate=1e-6)) + self.evaluate(st.assert_true_mean_equal_by_dkwm_two_sample( + samples1, 0., 1., samples2, 0., 1., false_fail_rate=1e-6)) def test_dkwm_mean_two_sample_assertion_beta_2_1_false(self): rng = np.random.RandomState(seed=0) @@ -168,15 +188,14 @@ def test_dkwm_mean_two_sample_assertion_beta_2_1_false(self): # As established above, 4000 samples is enough to find discrepancies # of size 0.2 or more with assurance 1e-6. - with self.test_session() as sess: - # Test that the test assertion confirms that the mean of the - # standard uniform distribution is different from the mean of beta(2, 1). - beta_high_samples = rng.beta(2, 1, size=num_samples).astype(np.float32) - with self.assertRaisesOpError("samples1 has a smaller mean"): - sess.run(st.assert_true_mean_equal_by_dkwm_two_sample( - samples1, 0., 1., - beta_high_samples, 0., 1., - false_fail_rate=1e-6)) + # Test that the test assertion confirms that the mean of the + # standard uniform distribution is different from the mean of beta(2, 1). + beta_high_samples = rng.beta(2, 1, size=num_samples).astype(np.float32) + with self.assertRaisesOpError("true mean smaller than expected"): + self.evaluate(st.assert_true_mean_equal_by_dkwm_two_sample( + samples1, 0., 1., + beta_high_samples, 0., 1., + false_fail_rate=1e-6)) def test_dkwm_mean_two_sample_assertion_beta_1_2_false(self): rng = np.random.RandomState(seed=0) @@ -186,15 +205,14 @@ def test_dkwm_mean_two_sample_assertion_beta_1_2_false(self): # As established above, 4000 samples is enough to find discrepancies # of size 0.2 or more with assurance 1e-6. - with self.test_session() as sess: - # Test that the test assertion confirms that the mean of the - # standard uniform distribution is different from the mean of beta(1, 2). - beta_low_samples = rng.beta(1, 2, size=num_samples).astype(np.float32) - with self.assertRaisesOpError("samples2 has a smaller mean"): - sess.run(st.assert_true_mean_equal_by_dkwm_two_sample( - samples1, 0., 1., - beta_low_samples, 0., 1., - false_fail_rate=1e-6)) + # Test that the test assertion confirms that the mean of the + # standard uniform distribution is different from the mean of beta(1, 2). 
+ beta_low_samples = rng.beta(1, 2, size=num_samples).astype(np.float32) + with self.assertRaisesOpError("true mean greater than expected"): + self.evaluate(st.assert_true_mean_equal_by_dkwm_two_sample( + samples1, 0., 1., + beta_low_samples, 0., 1., + false_fail_rate=1e-6)) def test_dkwm_argument_validity_checking(self): rng = np.random.RandomState(seed=0) @@ -203,18 +221,17 @@ def test_dkwm_argument_validity_checking(self): # Test that the test library complains if the given samples fall # outside the purported bounds. - with self.test_session() as sess: - with self.assertRaisesOpError("maximum value exceeds expectations"): - sess.run(st.true_mean_confidence_interval_by_dkwm( - samples, [[0., 1.]], [[0.5, 1.5]], error_rate=0.5)) - with self.assertRaisesOpError("minimum value falls below expectations"): - sess.run(st.true_mean_confidence_interval_by_dkwm( - samples, [[0.5, 1.5]], [[1., 2.]], error_rate=0.5)) - - # But doesn't complain if they don't. - op = st.true_mean_confidence_interval_by_dkwm( - samples, [[0., 1.]], [[1., 2.]], error_rate=0.5) - _ = sess.run(op) + with self.assertRaisesOpError("maximum value exceeds expectations"): + self.evaluate(st.true_mean_confidence_interval_by_dkwm( + samples, [[0., 1.]], [[0.5, 1.5]], error_rate=0.5)) + with self.assertRaisesOpError("minimum value falls below expectations"): + self.evaluate(st.true_mean_confidence_interval_by_dkwm( + samples, [[0.5, 1.5]], [[1., 2.]], error_rate=0.5)) + + # But doesn't complain if they don't. + op = st.true_mean_confidence_interval_by_dkwm( + samples, [[0., 1.]], [[1., 2.]], error_rate=0.5) + _ = self.evaluate(op) def test_do_maximum_mean(self): n = 117 @@ -223,10 +240,9 @@ def test_do_maximum_mean(self): samples = rng.uniform(size=n).astype(np.float32) # Compute the answer in TF using the code under test - with self.test_session() as sess: - envelope_t = ops.convert_to_tensor(envelope) - max_mean = st._do_maximum_mean(samples, envelope_t, 1) - max_mean = sess.run(max_mean) + envelope_t = ops.convert_to_tensor(envelope) + max_mean = st._do_maximum_mean(samples, envelope_t, 1) + max_mean = self.evaluate(max_mean) # Compute the correct answer for this case in numpy. In this # example, `n` and `envelope` are such that `samples[2]` is the diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/BUILD b/tensorflow/contrib/distributions/python/kernel_tests/util/BUILD new file mode 100644 index 00000000000000..03e26b198ea02a --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/util/BUILD @@ -0,0 +1,48 @@ +# Description: +# Internal testing utilities, e.g., computing the correct answer to +# put in a unit test. 
+ +licenses(["notice"]) # Apache 2.0 + +py_library( + name = "correlation_matrix_volumes_py", + srcs = [ + "correlation_matrix_volumes_lib.py", + ], + deps = [ + "//tensorflow/contrib/distributions:distributions_py", + "//tensorflow/python:array_ops", + "//tensorflow/python:errors", + "//tensorflow/python:framework", + "//tensorflow/python:framework_for_generated_wrappers", + "//tensorflow/python:math_ops", + "//third_party/py/numpy", + ], +) + +py_binary( + name = "correlation_matrix_volumes", + srcs = [ + "correlation_matrix_volumes.py", + ], + deps = [ + ":correlation_matrix_volumes_py", + ], +) + +py_test( + name = "correlation_matrix_volumes_test", + size = "medium", + srcs = ["correlation_matrix_volumes_test.py"], + tags = ["no_pip"], + deps = [ + ":correlation_matrix_volumes_py", + # For statistical testing + "//tensorflow/contrib/distributions:distributions_py", + "//third_party/py/numpy", + "//tensorflow/python:array_ops", + "//tensorflow/python:check_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework", + ], +) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes.py b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes.py new file mode 100644 index 00000000000000..2eab51cd3053ea --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes.py @@ -0,0 +1,98 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Executable to estimate the volume of various sets of correlation matrices. + +See correlation_matrix_volumes_lib.py for purpose and methodology. + +Invocation example: +``` +python correlation_matrix_volumes.py --num_samples 1e7 +``` + +This will compute 10,000,000-sample confidence intervals for the +volumes of several sets of correlation matrices. Which sets, and the +desired statistical significance, are hard-coded in this source file. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import pprint + +from absl import app +from absl import flags + +from tensorflow.contrib.distributions.python.kernel_tests.util import correlation_matrix_volumes_lib as corr + +FLAGS = flags.FLAGS + +# Float to support giving the number of samples in scientific notation. +# The production run used for the LKJ test used 1e7 samples. +flags.DEFINE_float('num_samples', 1e4, 'Number of samples to use.') + + +def ctv_debatched(det_bounds, dim, num_samples, error_rate=1e-6, seed=42): + # This wrapper undoes the batching in compute_true_volumes, because + # apparently several 5x5x9x1e7 Tensors of float32 can strain RAM. 
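+  # For example, with det_bounds = [0.01, 0.25] this performs two separate
+  # [num_samples, 1, dim, dim] rejection passes rather than one
+  # [num_samples, 2, dim, dim] pass (an illustration of the intent, not an
+  # exact memory accounting).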
+ bounds = {} + for db in det_bounds: + bounds[db] = corr.compute_true_volumes( + [db], dim, num_samples, error_rate=error_rate, seed=seed)[db] + return bounds + + +# The particular bounds in all three of these functions were chosen by +# a somewhat arbitrary walk through an empirical tradeoff, for the +# purpose of testing the LKJ distribution. Setting the determinant +# bound lower +# - Covers more of the testee's sample space, and +# - Increases the probability that the rejection sampler will hit, thus +# - Decreases the relative error (at a fixed sample count) in the +# rejection-based volume estimate; +# but also +# - Increases the variance of the estimator used in the LKJ test. +# This latter variance is also affected by the dimension and the +# tested concentration parameter, and can be compensated for with more +# compute (expensive) or a looser discrepancy limit (unsatisfying). +# The values here are the projection of the points in that test design +# space that ended up getting chosen. +def compute_3x3_volumes(num_samples): + det_bounds = [0.01, 0.25, 0.3, 0.35, 0.4, 0.45] + return ctv_debatched( + det_bounds, 3, num_samples, error_rate=5e-7, seed=46) + + +def compute_4x4_volumes(num_samples): + det_bounds = [0.01, 0.25, 0.3, 0.35, 0.4, 0.45] + return ctv_debatched( + det_bounds, 4, num_samples, error_rate=5e-7, seed=47) + + +def compute_5x5_volumes(num_samples): + det_bounds = [0.01, 0.2, 0.25, 0.3, 0.35, 0.4] + return ctv_debatched( + det_bounds, 5, num_samples, error_rate=5e-7, seed=48) + + +def main(_): + full_bounds = {} + full_bounds[3] = compute_3x3_volumes(int(FLAGS.num_samples)) + full_bounds[4] = compute_4x4_volumes(int(FLAGS.num_samples)) + full_bounds[5] = compute_5x5_volumes(int(FLAGS.num_samples)) + pprint.pprint(full_bounds) + +if __name__ == '__main__': + app.run(main) diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_lib.py b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_lib.py new file mode 100644 index 00000000000000..455e71f00c96e7 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_lib.py @@ -0,0 +1,323 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Estimating the volume of the correlation matrices with bounded determinant. + +Why? Because lkj_test.py tests the sampler for the LKJ distribution +by estimating the same volume another way. + +How? Rejection sampling. Or, more precisely, importance sampling, +proposing from the uniform distribution on symmetric matrices with +diagonal 1s and entries in [-1, 1]. Such a matrix is a correlation +matrix if and only if it is also positive semi-definite. 
+ +The samples can then be converted into a confidence interval on the +volume in question by the [Clopper-Pearson +method](https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval), +also implemented here. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import importlib +import sys + +import numpy as np + +from tensorflow.python.client import session +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import uniform +from tensorflow.python.ops.distributions import util +from tensorflow.python.platform import tf_logging + +__all__ = [ + "correlation_matrix_volume_rejection_samples", + "compute_true_volumes", +] + + +def try_import(name): # pylint: disable=invalid-name + module = None + try: + module = importlib.import_module(name) + except ImportError as e: + tf_logging.warning("Could not import %s: %s" % (name, str(e))) + return module + +optimize = try_import("scipy.optimize") +stats = try_import("scipy.stats") + + +def _psd_mask(x): + """Computes whether each square matrix in the input is positive semi-definite. + + Args: + x: A floating-point `Tensor` of shape `[B1, ..., Bn, M, M]`. + + Returns: + mask: A floating-point `Tensor` of shape `[B1, ... Bn]`. Each + scalar is 1 if the corresponding matrix was PSD, otherwise 0. + """ + # Allegedly + # https://scicomp.stackexchange.com/questions/12979/testing-if-a-matrix-is-positive-semi-definite + # it is more efficient to test for positive semi-definiteness by + # trying to compute the Cholesky decomposition -- the matrix is PSD + # if you succeed and not PSD if you fail. However, TensorFlow's + # Cholesky raises an exception if _any_ of the input matrices are + # not PSD, from which I don't know how to extract _which ones_, so I + # proceed by explicitly computing all the eigenvalues and checking + # whether they are all positive or not. + # + # Also, as was discussed in the answer, it is somewhat dangerous to + # treat SPD-ness as binary in floating-point arithmetic. Cholesky + # factorization can complete and 'look' like everything is fine + # (e.g., O(1) entries and a diagonal of all ones) but the matrix can + # have an exponential condition number. + eigenvalues, _ = linalg_ops.self_adjoint_eig(x) + return math_ops.cast( + math_ops.reduce_min(eigenvalues, axis=-1) >= 0, dtype=x.dtype) + + +def _det_large_enough_mask(x, det_bounds): + """Returns whether the input matches the given determinant limit. + + Args: + x: A floating-point `Tensor` of shape `[B1, ..., Bn, M, M]`. + det_bounds: A floating-point `Tensor` that must broadcast to shape + `[B1, ..., Bn]`, giving the desired lower bound on the + determinants in `x`. + + Returns: + mask: A floating-point `Tensor` of shape [B1, ..., Bn]. Each + scalar is 1 if the corresponding matrix had determinant above + the corresponding bound, otherwise 0. + """ + # For the curious: I wonder whether it is possible and desirable to + # use a Cholesky decomposition-based algorithm for this, since the + # only matrices whose determinant this code cares about will be PSD. + # Didn't figure out how to code that in TensorFlow. + # + # Expert opinion is that it would be about twice as fast since + # Cholesky is roughly half the cost of Gaussian Elimination with + # Partial Pivoting. But this is less of an impact than the switch in + # _psd_mask. 
+ return math_ops.cast( + linalg_ops.matrix_determinant(x) > det_bounds, dtype=x.dtype) + + +def _uniform_correlation_like_matrix(num_rows, batch_shape, dtype, seed): + """Returns a uniformly random `Tensor` of "correlation-like" matrices. + + A "correlation-like" matrix is a symmetric square matrix with all entries + between -1 and 1 (inclusive) and 1s on the main diagonal. Of these, + the ones that are positive semi-definite are exactly the correlation + matrices. + + Args: + num_rows: Python `int` dimension of the correlation-like matrices. + batch_shape: `Tensor` or Python `tuple` of `int` shape of the + batch to return. + dtype: `dtype` of the `Tensor` to return. + seed: Random seed. + + Returns: + matrices: A `Tensor` of shape `batch_shape + [num_rows, num_rows]` + and dtype `dtype`. Each entry is in [-1, 1], and each matrix + along the bottom two dimensions is symmetric and has 1s on the + main diagonal. + """ + num_entries = num_rows * (num_rows + 1) / 2 + ones = array_ops.ones(shape=[num_entries], dtype=dtype) + # It seems wasteful to generate random values for the diagonal since + # I am going to throw them away, but `fill_triangular` fills the + # diagonal, so I probably need them. + # It's not impossible that it would be more efficient to just fill + # the whole matrix with random values instead of messing with + # `fill_triangular`. Then would need to filter almost half out with + # `matrix_band_part`. + unifs = uniform.Uniform(-ones, ones).sample(batch_shape, seed=seed) + tril = util.fill_triangular(unifs) + symmetric = tril + array_ops.matrix_transpose(tril) + diagonal_ones = array_ops.ones( + shape=util.pad(batch_shape, axis=0, back=True, value=num_rows), + dtype=dtype) + return array_ops.matrix_set_diag(symmetric, diagonal_ones) + + +def correlation_matrix_volume_rejection_samples( + det_bounds, dim, sample_shape, dtype, seed): + """Returns rejection samples from trying to get good correlation matrices. + + The proposal being rejected from is the uniform distribution on + "correlation-like" matrices. We say a matrix is "correlation-like" + if it is a symmetric square matrix with all entries between -1 and 1 + (inclusive) and 1s on the main diagonal. Of these, the ones that + are positive semi-definite are exactly the correlation matrices. + + The rejection algorithm, then, is to sample a `Tensor` of + `sample_shape` correlation-like matrices of dimensions `dim` by + `dim`, and check each one for (i) being a correlation matrix (i.e., + PSD), and (ii) having determinant at least the corresponding entry + of `det_bounds`. + + Args: + det_bounds: A `Tensor` of lower bounds on the determinants of + acceptable matrices. The shape must broadcast with `sample_shape`. + dim: A Python `int` dimension of correlation matrices to sample. + sample_shape: Python `tuple` of `int` shape of the samples to + compute, excluding the two matrix dimensions. + dtype: The `dtype` in which to do the computation. + seed: Random seed. + + Returns: + weights: A `Tensor` of shape `sample_shape`. Each entry is 0 if the + corresponding matrix was not a correlation matrix, or had too + small of a determinant. Otherwise, the entry is the + multiplicative inverse of the density of proposing that matrix + uniformly, i.e., the volume of the set of `dim` by `dim` + correlation-like matrices. + volume: The volume of the set of `dim` by `dim` correlation-like + matrices. 
+ """ + with ops.name_scope("rejection_sampler"): + rej_proposals = _uniform_correlation_like_matrix( + dim, sample_shape, dtype, seed=seed) + rej_proposal_volume = 2. ** (dim * (dim - 1) / 2.) + # The density of proposing any given point is 1 / rej_proposal_volume; + # The weight of that point should be scaled by + # 1 / density = rej_proposal_volume. + rej_weights = rej_proposal_volume * _psd_mask( + rej_proposals) * _det_large_enough_mask(rej_proposals, det_bounds) + return rej_weights, rej_proposal_volume + + +def _clopper_pearson_confidence_interval(samples, error_rate): + """Computes a confidence interval for the mean of the given 1-D distribution. + + Assumes (and checks) that the given distribution is Bernoulli, i.e., + takes only two values. This licenses using the CDF of the binomial + distribution for the confidence, which is tighter (for extreme + probabilities) than the DKWM inequality. The method is known as the + [Clopper-Pearson method] + (https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval). + + Assumes: + + - The given samples were drawn iid from the distribution of interest. + + - The given distribution is a Bernoulli, i.e., supported only on + low and high. + + Guarantees: + + - The probability (over the randomness of drawing the given sample) + that the true mean is outside the returned interval is no more + than the given error_rate. + + Args: + samples: `np.ndarray` of samples drawn iid from the distribution + of interest. + error_rate: Python `float` admissible rate of mistakes. + + Returns: + low: Lower bound of confidence interval. + high: Upper bound of confidence interval. + + Raises: + ValueError: If `samples` has rank other than 1 (batch semantics + are not implemented), or if `samples` contains values other than + `low` or `high` (as that makes the distribution not Bernoulli). + """ + # TODO(b/78025336) Migrate this confidence interval function + # to statistical_testing.py. In order to do that + # - Get the binomial CDF from the Binomial distribution + # - Implement scalar root finding in TF. Batch bisection search + # shouldn't be too hard, and is definitely good enough for this + # problem. Batching the Brent algorithm (from scipy) that is used + # here may be more involved, but may also not be necessary---it's + # only used here because scipy made it convenient. In particular, + # robustness is more important than speed here, which may make + # bisection search actively better. + # - The rest is just a matter of rewriting in the appropriate style. + if optimize is None or stats is None: + raise ValueError( + "Scipy is required for computing Clopper-Pearson confidence intervals") + if len(samples.shape) != 1: + raise ValueError("Batch semantics not implemented") + n = len(samples) + low = np.amin(samples) + high = np.amax(samples) + successes = np.count_nonzero(samples - low) + failures = np.count_nonzero(samples - high) + if successes + failures != n: + uniques = np.unique(samples) + msg = ("Purportedly Bernoulli distribution had distinct samples" + " {}, {}, and {}".format(uniques[0], uniques[1], uniques[2])) + raise ValueError(msg) + def p_small_enough(p): + prob = stats.binom.logcdf(successes, n, p) + return prob - np.log(error_rate / 2.) + def p_big_enough(p): + prob = stats.binom.logsf(successes, n, p) + return prob - np.log(error_rate / 2.) 
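+  # Note on the root finding below (illustrative): high_p solves
+  # P(Binomial(n, p) <= successes) = error_rate / 2, i.e. it is the largest
+  # success probability still consistent with seeing so few successes, and
+  # low_p solves P(Binomial(n, p) > successes) = error_rate / 2.  Both are
+  # then mapped from [0, 1] back to the [low, high] scale of the samples.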
+ high_p = optimize.brentq( + p_small_enough, float(successes) / n, 1., rtol=1e-9) + low_p = optimize.brentq( + p_big_enough, 0., float(successes) / n, rtol=1e-9) + low_interval = low + (high - low) * low_p + high_interval = low + (high - low) * high_p + return (low_interval, high_interval) + + +def compute_true_volumes( + det_bounds, dim, num_samples, error_rate=1e-6, seed=42): + """Returns confidence intervals for the desired correlation matrix volumes. + + The confidence intervals are computed by the [Clopper-Pearson method] + (https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval). + + Args: + det_bounds: A rank-1 numpy array of lower bounds on the + determinants of acceptable matrices. Entries must be unique. + dim: A Python `int` dimension of correlation matrices to sample. + num_samples: The number of samples to draw. + error_rate: The statistical significance of the returned + confidence intervals. The significance is broadcast: Each + returned interval separately may be incorrect with probability + (under the sample of correlation-like matrices drawn internally) + at most `error_rate`. + seed: Random seed. + + Returns: + bounds: A Python `dict` mapping each determinant bound to the low, high + tuple giving the confidence interval. + """ + bounds = {} + with session.Session() as sess: + rej_weights, _ = correlation_matrix_volume_rejection_samples( + det_bounds, dim, [num_samples, len(det_bounds)], np.float32, seed=seed) + rej_weights = sess.run(rej_weights) + for rw, det in zip(np.rollaxis(rej_weights, 1), det_bounds): + template = ("Estimating volume of {}x{} correlation " + "matrices with determinant >= {}.") + print(template.format(dim, dim, det)) + sys.stdout.flush() + bounds[det] = _clopper_pearson_confidence_interval( + rw, error_rate=error_rate) + return bounds diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_test.py b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_test.py new file mode 100644 index 00000000000000..8f99300e638711 --- /dev/null +++ b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_test.py @@ -0,0 +1,150 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ==============================================================================
+"""Tests for correlation_matrix_volumes_lib.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.kernel_tests.util import correlation_matrix_volumes_lib as corr
+from tensorflow.contrib.distributions.python.ops import statistical_testing as st
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.platform import test
+
+
+# NxN correlation matrices are determined by the N*(N-1)/2
+# lower-triangular entries. In addition to being between -1 and 1,
+# they must also obey the constraint that the determinant of the
+# resulting symmetric matrix is non-negative. In 2x2, we can even
+# compute the volume analytically when the determinant is required to
+# exceed epsilon, as that boils down to the one lower-triangular entry
+# being less than sqrt(1 - epsilon) in absolute value.
+def two_by_two_volume(det_bound):
+  return 2 * np.sqrt(1.0 - det_bound)
+
+
+# The post
+# https://psychometroscar.com/the-volume-of-a-3-x-3-correlation-matrix/
+# derives (with elementary calculus) that the volume (with respect to
+# Lebesgue^3 measure) of the set of 3x3 correlation matrices is
+# pi^2/2. The same result is also obtained by [1].
+def three_by_three_volume():
+  return np.pi**2 / 2.
+
+
+# The volume of the unconstrained set of correlation matrices is also
+# the normalization constant of the LKJ distribution from [2]. As
+# part of defining the distribution, that reference derives a general
+# formula for this volume for all dimensions. A TensorFlow
+# computation thereof gave the below result for 4x4:
+def four_by_four_volume():
+  # This constant was computed as math_ops.exp(lkj.log_norm_const(4, [1.0]))
+  return 11.6973076
+
+# [1] Rousseeuw, P. J., & Molenberghs, G. (1994). "The shape of
+# correlation matrices." The American Statistician, 48(4), 276-279.
+
+# [2] Daniel Lewandowski, Dorota Kurowicka, and Harry Joe, "Generating
+# random correlation matrices based on vines and extended onion
+# method," Journal of Multivariate Analysis 100 (2009), pp 1989-2001.
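+
+# A quick numerical illustration of the 2x2 case (not load-bearing for the
+# tests below): a 2x2 correlation-like matrix is [[1, r], [r, 1]] with r in
+# [-1, 1] and determinant 1 - r**2, so det >= b exactly when
+# |r| <= sqrt(1 - b).  For instance,
+#   two_by_two_volume(0.)   == 2.0  # r ranges over all of [-1, 1]
+#   two_by_two_volume(0.75) == 1.0  # r restricted to [-0.5, 0.5]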
+
+
+class CorrelationMatrixVolumesTest(test.TestCase):
+
+  def testRejection2D(self):
+    num_samples = int(1e5)  # Chosen for a small min detectable discrepancy
+    det_bounds = np.array(
+        [0.01, 0.02, 0.03, 0.04, 0.05, 0.3, 0.35, 0.4, 0.5], dtype=np.float32)
+    exact_volumes = two_by_two_volume(det_bounds)
+    (rej_weights,
+     rej_proposal_volume) = corr.correlation_matrix_volume_rejection_samples(
+         det_bounds, 2, [num_samples, 9], dtype=np.float32, seed=43)
+    # shape of rej_weights: [num_samples, 9, 2, 2]
+    chk1 = st.assert_true_mean_equal_by_dkwm(
+        rej_weights, low=0., high=rej_proposal_volume, expected=exact_volumes,
+        false_fail_rate=1e-6)
+    chk2 = check_ops.assert_less(
+        st.min_discrepancy_of_true_means_detectable_by_dkwm(
+            num_samples, low=0., high=rej_proposal_volume,
+            # Correct the false fail rate due to different broadcasting
+            false_fail_rate=1.1e-7, false_pass_rate=1e-6),
+        0.036)
+    with ops.control_dependencies([chk1, chk2]):
+      rej_weights = array_ops.identity(rej_weights)
+    self.evaluate(rej_weights)
+
+  def testRejection3D(self):
+    num_samples = int(1e5)  # Chosen for a small min detectable discrepancy
+    det_bounds = np.array([0.0], dtype=np.float32)
+    exact_volumes = np.array([three_by_three_volume()], dtype=np.float32)
+    (rej_weights,
+     rej_proposal_volume) = corr.correlation_matrix_volume_rejection_samples(
+         det_bounds, 3, [num_samples, 1], dtype=np.float32, seed=44)
+    # shape of rej_weights: [num_samples, 1, 3, 3]
+    chk1 = st.assert_true_mean_equal_by_dkwm(
+        rej_weights, low=0., high=rej_proposal_volume, expected=exact_volumes,
+        false_fail_rate=1e-6)
+    chk2 = check_ops.assert_less(
+        st.min_discrepancy_of_true_means_detectable_by_dkwm(
+            num_samples, low=0., high=rej_proposal_volume,
+            false_fail_rate=1e-6, false_pass_rate=1e-6),
+        # Going for about a 3% relative error
+        0.15)
+    with ops.control_dependencies([chk1, chk2]):
+      rej_weights = array_ops.identity(rej_weights)
+    self.evaluate(rej_weights)
+
+  def testRejection4D(self):
+    num_samples = int(1e5)  # Chosen for a small min detectable discrepancy
+    det_bounds = np.array([0.0], dtype=np.float32)
+    exact_volumes = [four_by_four_volume()]
+    (rej_weights,
+     rej_proposal_volume) = corr.correlation_matrix_volume_rejection_samples(
+         det_bounds, 4, [num_samples, 1], dtype=np.float32, seed=45)
+    # shape of rej_weights: [num_samples, 1, 4, 4]
+    chk1 = st.assert_true_mean_equal_by_dkwm(
+        rej_weights, low=0., high=rej_proposal_volume, expected=exact_volumes,
+        false_fail_rate=1e-6)
+    chk2 = check_ops.assert_less(
+        st.min_discrepancy_of_true_means_detectable_by_dkwm(
+            num_samples, low=0., high=rej_proposal_volume,
+            false_fail_rate=1e-6, false_pass_rate=1e-6),
+        # Going for about a 10% relative error
+        1.1)
+    with ops.control_dependencies([chk1, chk2]):
+      rej_weights = array_ops.identity(rej_weights)
+    self.evaluate(rej_weights)
+
+  def testVolumeEstimation2D(self):
+    # Test that the confidence intervals produced by
+    # corr.compute_true_volumes are sound, in the sense of containing
+    # the exact volume.
+ num_samples = int(1e5) # Chosen by symmetry with testRejection2D + det_bounds = np.array( + [0.01, 0.02, 0.03, 0.04, 0.05, 0.3, 0.35, 0.4, 0.5], dtype=np.float32) + volume_bounds = corr.compute_true_volumes( + det_bounds, 2, num_samples, error_rate=1e-6, seed=47) + exact_volumes = two_by_two_volume(det_bounds) + for det, volume in zip(det_bounds, exact_volumes): + computed_low, computed_high = volume_bounds[det] + self.assertLess(computed_low, volume) + self.assertGreater(computed_high, volume) + +if __name__ == "__main__": + test.main() diff --git a/tensorflow/contrib/distributions/python/ops/autoregressive.py b/tensorflow/contrib/distributions/python/ops/autoregressive.py index 88ed0127841093..11ca90c4833d84 100644 --- a/tensorflow/contrib/distributions/python/ops/autoregressive.py +++ b/tensorflow/contrib/distributions/python/ops/autoregressive.py @@ -144,7 +144,7 @@ def __init__(self, `distribution_fn(sample0).event_shape.num_elements()` are both `None`. ValueError: if `num_steps < 1`. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name) as name: self._distribution_fn = distribution_fn self._sample0 = sample0 diff --git a/tensorflow/contrib/distributions/python/ops/batch_reshape.py b/tensorflow/contrib/distributions/python/ops/batch_reshape.py index bf5590cd552a91..4714caad69ee43 100644 --- a/tensorflow/contrib/distributions/python/ops/batch_reshape.py +++ b/tensorflow/contrib/distributions/python/ops/batch_reshape.py @@ -41,9 +41,6 @@ class BatchReshape(distribution_lib.Distribution): This "meta-distribution" reshapes the batch dimensions of another distribution. - Note: Unlike `tf.reshape`, the `BatchReshape` distribution does not support - `-1` for flattening. - #### Examples ```python @@ -51,7 +48,7 @@ class BatchReshape(distribution_lib.Distribution): dtype = np.float32 dims = 2 - new_batch_shape = [1, 2, 3] + new_batch_shape = [1, 2, -1] old_batch_shape = [6] scale = np.ones(old_batch_shape + [dims], dtype) @@ -85,8 +82,9 @@ def __init__(self, Args: distribution: The base distribution instance to reshape. Typically an instance of `Distribution`. - batch_shape: Positive `int`-like vector-shaped `Tensor` representing the - new shape of the batch dimensions. + batch_shape: Positive `int`-like vector-shaped `Tensor` representing + the new shape of the batch dimensions. Up to one dimension may contain + `-1`, meaning the remainder of the batch size. validate_args: Python `bool`, default `False`. When `True` distribution parameters are checked for validity despite possibly degrading runtime performance. When `False` invalid inputs may silently render incorrect @@ -104,31 +102,28 @@ def __init__(self, ValueError: if `batch_shape` size is not the same as a `distribution.batch_shape` size. """ - parameters = locals() + parameters = dict(locals()) name = name or "BatchReshape" + distribution.name - self._distribution = distribution with ops.name_scope(name, values=[batch_shape]) as name: - self._batch_shape_ = ops.convert_to_tensor( - batch_shape, - dtype=dtypes.int32, - name="batch_shape") - self._batch_shape_static = tensor_util.constant_value(self._batch_shape_) - if self._batch_shape_static is not None: - self._batch_shape_static = np.int32(self._batch_shape_static) - self._runtime_assertions = validate_init_args( - self._distribution, - self._batch_shape_, - validate_args, - self._batch_shape_static) + # The unexpanded batch shape may contain up to one dimension of -1. 
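+      # For example, reshaping a distribution whose batch_shape is [6] with
+      # batch_shape=[1, 2, -1] yields batch_shape [1, 2, 3], as in the
+      # docstring example above.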
+ self._batch_shape_unexpanded = ops.convert_to_tensor( + batch_shape, dtype=dtypes.int32, name="batch_shape") + validate_init_args_statically(distribution, self._batch_shape_unexpanded) + batch_shape, batch_shape_static, runtime_assertions = calculate_reshape( + distribution.batch_shape_tensor(), self._batch_shape_unexpanded, + validate_args) + self._distribution = distribution + self._batch_shape_ = batch_shape + self._batch_shape_static = batch_shape_static + self._runtime_assertions = runtime_assertions super(BatchReshape, self).__init__( - dtype=self._distribution.dtype, - reparameterization_type=self._distribution.reparameterization_type, + dtype=distribution.dtype, + reparameterization_type=distribution.reparameterization_type, validate_args=validate_args, allow_nan_stats=allow_nan_stats, parameters=parameters, graph_parents=( - [self._batch_shape_] + - self._distribution._graph_parents), # pylint: disable=protected-access + [self._batch_shape_unexpanded] + distribution._graph_parents), # pylint: disable=protected-access name=name) @property @@ -140,7 +135,7 @@ def _batch_shape_tensor(self): return array_ops.identity(self._batch_shape_) def _batch_shape(self): - return tensor_shape.TensorShape(self._batch_shape_static) + return self._batch_shape_static def _event_shape_tensor(self): with ops.control_dependencies(self._runtime_assertions): @@ -152,11 +147,13 @@ def _event_shape(self): def _sample_n(self, n, seed=None): with ops.control_dependencies(self._runtime_assertions): x = self.distribution.sample(sample_shape=n, seed=seed) - new_shape = array_ops.concat([ - [n], - self.batch_shape_tensor(), - self.event_shape_tensor(), - ], axis=0) + new_shape = array_ops.concat( + [ + [n], + self._batch_shape_unexpanded, + self.event_shape_tensor(), + ], + axis=0) return array_ops.reshape(x, new_shape) def _log_prob(self, x): @@ -213,9 +210,9 @@ def _sample_shape(self, x): event_ndims = (array_ops.size(self.event_shape_tensor()) if self.event_shape.ndims is None else self.event_shape.ndims) - batch_ndims = (array_ops.size(self.batch_shape_tensor()) - if self.batch_shape.ndims is None - else self.batch_shape.ndims) + batch_ndims = ( + array_ops.size(self._batch_shape_unexpanded) + if self.batch_shape.ndims is None else self.batch_shape.ndims) sample_ndims = x_ndims - batch_ndims - event_ndims if isinstance(sample_ndims, int): static_sample_shape = x.shape[:sample_ndims] @@ -238,10 +235,11 @@ def _call_reshape_input_output(self, fn, x): self.event_shape_tensor(), ], axis=0) result = fn(array_ops.reshape(x, old_shape)) - new_shape = array_ops.concat([ - sample_shape, - self.batch_shape_tensor(), - ], axis=0) + new_shape = array_ops.concat( + [ + sample_shape, + self._batch_shape_unexpanded, + ], axis=0) result = array_ops.reshape(result, new_shape) if (static_sample_shape.ndims is not None and self.batch_shape.ndims is not None): @@ -261,8 +259,7 @@ def _call_and_reshape_output( if static_event_shape_list is None: static_event_shape_list = [self.event_shape] new_shape = array_ops.concat( - [self.batch_shape_tensor()] + event_shape_list, - axis=0) + [self._batch_shape_unexpanded] + event_shape_list, axis=0) result = array_ops.reshape(fn(), new_shape) if (self.batch_shape.ndims is not None and self.event_shape.ndims is not None): @@ -281,9 +278,9 @@ def _validate_sample_arg(self, x): event_ndims = (array_ops.size(self.event_shape_tensor()) if self.event_shape.ndims is None else self.event_shape.ndims) - batch_ndims = (array_ops.size(self.batch_shape_tensor()) - if self.batch_shape.ndims is None - 
else self.batch_shape.ndims) + batch_ndims = ( + array_ops.size(self._batch_shape_unexpanded) + if self.batch_shape.ndims is None else self.batch_shape.ndims) expected_batch_event_ndims = batch_ndims + event_ndims if (isinstance(x_ndims, int) and @@ -355,62 +352,56 @@ def _validate_sample_arg(self, x): return runtime_assertions -def validate_init_args( - distribution, - batch_shape, - validate_args, - batch_shape_static): +def calculate_reshape(original_shape, new_shape, validate=False, name=None): + """Calculates the reshaped dimensions (replacing up to one -1 in reshape).""" + batch_shape_static = tensor_util.constant_value_as_shape(new_shape) + if batch_shape_static.is_fully_defined(): + return np.int32(batch_shape_static.as_list()), batch_shape_static, [] + with ops.name_scope(name, "calculate_reshape", [original_shape, new_shape]): + original_size = math_ops.reduce_prod(original_shape) + implicit_dim = math_ops.equal(new_shape, -1) + size_implicit_dim = ( + original_size // math_ops.maximum(1, -math_ops.reduce_prod(new_shape))) + new_ndims = array_ops.shape(new_shape) + expanded_new_shape = array_ops.where( # Assumes exactly one `-1`. + implicit_dim, array_ops.fill(new_ndims, size_implicit_dim), new_shape) + validations = [] if not validate else [ + check_ops.assert_rank( + original_shape, 1, message="Original shape must be a vector."), + check_ops.assert_rank( + new_shape, 1, message="New shape must be a vector."), + check_ops.assert_less_equal( + math_ops.count_nonzero(implicit_dim, dtype=dtypes.int32), + 1, + message="At most one dimension can be unknown."), + check_ops.assert_positive( + expanded_new_shape, message="Shape elements must be >=-1."), + check_ops.assert_equal( + math_ops.reduce_prod(expanded_new_shape), + original_size, + message="Shape sizes do not match."), + ] + return expanded_new_shape, batch_shape_static, validations + + +def validate_init_args_statically(distribution, batch_shape): """Helper to __init__ which makes or raises assertions.""" - with ops.name_scope(name="validate_init_args", - values=[batch_shape] + distribution._graph_parents): # pylint: disable=protected-access - runtime_assertions = [] - - if batch_shape.shape.ndims is not None: - if batch_shape.shape.ndims != 1: - raise ValueError("`batch_shape` must be a vector " - "(saw rank: {}).".format( - batch_shape.shape.ndims)) - elif validate_args: - runtime_assertions += [ - check_ops.assert_rank( - batch_shape, - 1, - message="`batch_shape` must be a vector.", - name="assert_batch_shape_is_vector"), - ] - - batch_size_static = np.prod(batch_shape_static) - dist_batch_size_static = ( - None if not distribution.batch_shape.is_fully_defined() - else np.prod(distribution.batch_shape).value) - - if batch_size_static is not None and dist_batch_size_static is not None: - if batch_size_static != dist_batch_size_static: - raise ValueError("`batch_shape` size ({}) must match " - "`distribution.batch_shape` size ({}).".format( - batch_size_static, - dist_batch_size_static)) - elif validate_args: - runtime_assertions += [ - check_ops.assert_equal( - math_ops.reduce_prod(batch_shape), - math_ops.reduce_prod(distribution.batch_shape_tensor()), - message=("`batch_shape` size must match " - "`distributions.batch_shape` size."), - name="assert_batch_size"), - ] - - if batch_shape_static is not None: - if np.any(batch_shape_static < 1): - raise ValueError("`batch_shape` elements must be positive " - "(i.e., larger than zero).") - elif validate_args: - runtime_assertions += [ - check_ops.assert_positive( - batch_shape, 
- message=("`batch_shape` elements must be positive " - "(i.e., larger than zero)."), - name="assert_batch_shape_positive") - ] - - return runtime_assertions + if batch_shape.shape.ndims is not None: + if batch_shape.shape.ndims != 1: + raise ValueError("`batch_shape` must be a vector " + "(saw rank: {}).".format(batch_shape.shape.ndims)) + + batch_shape_static = tensor_util.constant_value_as_shape(batch_shape) + batch_size_static = batch_shape_static.num_elements() + dist_batch_size_static = distribution.batch_shape.num_elements() + + if batch_size_static is not None and dist_batch_size_static is not None: + if batch_size_static != dist_batch_size_static: + raise ValueError("`batch_shape` size ({}) must match " + "`distribution.batch_shape` size ({}).".format( + batch_size_static, dist_batch_size_static)) + + if batch_shape_static.dims is not None: + if any( + dim.value is not None and dim.value < 1 for dim in batch_shape_static): + raise ValueError("`batch_shape` elements must be >=-1.") diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py index 51478dbeffaabc..4965381ef33e14 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py @@ -30,6 +30,7 @@ @@Invert @@Kumaraswamy @@MaskedAutoregressiveFlow +@@MatrixInverseTriL @@Ordered @@Permute @@PowerTransform @@ -68,6 +69,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.invert import * from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import * from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import * +from tensorflow.contrib.distributions.python.ops.bijectors.matrix_inverse_tril import * from tensorflow.contrib.distributions.python.ops.bijectors.ordered import * from tensorflow.contrib.distributions.python.ops.bijectors.permute import * from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import * diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py index 85ad23e4133ef0..16f959560ce0f1 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/chain.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/chain.py @@ -20,10 +20,9 @@ import itertools -from tensorflow.python.framework import constant_op from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops from tensorflow.python.ops.distributions import bijector @@ -36,15 +35,6 @@ def _use_static_shape(input_tensor, ndims): return input_tensor.shape.is_fully_defined() and isinstance(ndims, int) -def _maybe_get_event_ndims_statically(event_ndims): - static_event_ndims = (event_ndims if isinstance(event_ndims, int) - else tensor_util.constant_value(event_ndims)) - if static_event_ndims is not None: - return static_event_ndims - - return event_ndims - - def _compute_min_event_ndims(bijector_list, compute_forward=True): """Computes the min_event_ndims associated with the give list of bijectors. 
@@ -238,13 +228,13 @@ def _inverse(self, y, **kwargs): return y def _inverse_log_det_jacobian(self, y, **kwargs): - ildj = constant_op.constant( - 0., dtype=y.dtype.base_dtype, name="inverse_log_det_jacobian") + y = ops.convert_to_tensor(y, name="y") + ildj = math_ops.cast(0., dtype=y.dtype.base_dtype) if not self.bijectors: return ildj - event_ndims = _maybe_get_event_ndims_statically( + event_ndims = self._maybe_get_static_event_ndims( self.inverse_min_event_ndims) if _use_static_shape(y, event_ndims): @@ -258,11 +248,15 @@ def _inverse_log_det_jacobian(self, y, **kwargs): if _use_static_shape(y, event_ndims): event_shape = b.inverse_event_shape(event_shape) - event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims) + event_ndims = self._maybe_get_static_event_ndims( + event_shape.ndims) else: event_shape = b.inverse_event_shape_tensor(event_shape) - event_ndims = _maybe_get_event_ndims_statically( - array_ops.rank(event_shape)) + event_ndims = array_ops.size(event_shape) + event_ndims_ = self._maybe_get_static_event_ndims(event_ndims) + if event_ndims_ is not None: + event_ndims = event_ndims_ + y = b.inverse(y, **kwargs.get(b.name, {})) return ildj @@ -274,13 +268,12 @@ def _forward(self, x, **kwargs): def _forward_log_det_jacobian(self, x, **kwargs): x = ops.convert_to_tensor(x, name="x") - fldj = constant_op.constant( - 0., dtype=x.dtype, name="inverse_log_det_jacobian") + fldj = math_ops.cast(0., dtype=x.dtype.base_dtype) if not self.bijectors: return fldj - event_ndims = _maybe_get_event_ndims_statically( + event_ndims = self._maybe_get_static_event_ndims( self.forward_min_event_ndims) if _use_static_shape(x, event_ndims): @@ -293,13 +286,14 @@ def _forward_log_det_jacobian(self, x, **kwargs): x, event_ndims=event_ndims, **kwargs.get(b.name, {})) if _use_static_shape(x, event_ndims): event_shape = b.forward_event_shape(event_shape) - event_ndims = _maybe_get_event_ndims_statically(event_shape.ndims) + event_ndims = self._maybe_get_static_event_ndims(event_shape.ndims) else: event_shape = b.forward_event_shape_tensor(event_shape) - event_ndims = _maybe_get_event_ndims_statically( - array_ops.rank(event_shape)) + event_ndims = array_ops.size(event_shape) + event_ndims_ = self._maybe_get_static_event_ndims(event_ndims) + if event_ndims_ is not None: + event_ndims = event_ndims_ x = b.forward(x, **kwargs.get(b.name, {})) return fldj - diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py index ecdb8967f43e59..268c8d03426d43 100644 --- a/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py +++ b/tensorflow/contrib/distributions/python/ops/bijectors/cholesky_outer_product.py @@ -53,7 +53,7 @@ class CholeskyOuterProduct(bijector.Bijector): its spectrum), and that the product of two positive-diagonal lower-triangular matrices is another positive-diagonal lower-triangular matrix. - A simple inductive argument (proceding one column of L_3 at a time) shows + A simple inductive argument (proceeding one column of L_3 at a time) shows that, if `I = L_3 @ L_3.T`, with L_3 being lower-triangular with positive- diagonal, then `L_3 = I`. Thus, `L_1 = L_2`, proving injectivity of g. 
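To make the `CholeskyOuterProduct` argument above concrete: `forward` maps a positive-diagonal lower-triangular `L` to `L @ L.T`, and `inverse` recovers `L` as the Cholesky factor. A small illustrative sketch, assuming the `tf.contrib.distributions.bijectors` API available on this branch:

```python
import tensorflow as tf

tfb = tf.contrib.distributions.bijectors

bijector = tfb.CholeskyOuterProduct()
L = [[1., 0.], [2., 3.]]           # lower triangular, positive diagonal

with tf.Session() as sess:
  X = bijector.forward(L)          # L @ L^T = [[1., 2.], [2., 13.]]
  L_back = bijector.inverse(X)     # Cholesky factor of X, recovering L
  print(sess.run([X, L_back]))
```

The injectivity argument quoted above is what guarantees this round trip is exact for positive-diagonal lower-triangular inputs.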
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/matrix_inverse_tril.py b/tensorflow/contrib/distributions/python/ops/bijectors/matrix_inverse_tril.py new file mode 100644 index 00000000000000..71903f705232f0 --- /dev/null +++ b/tensorflow/contrib/distributions/python/ops/bijectors/matrix_inverse_tril.py @@ -0,0 +1,145 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""MatrixInverseTriL bijector.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import linalg_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops.distributions import bijector + + +__all__ = [ + "MatrixInverseTriL", +] + + +class MatrixInverseTriL(bijector.Bijector): + """Computes `g(L) = inv(L)`, where `L` is a lower-triangular matrix. + + `L` must be nonsingular; equivalently, all diagonal entries of `L` must be + nonzero. + + The input must have `rank >= 2`. The input is treated as a batch of matrices + with batch shape `input.shape[:-2]`, where each matrix has dimensions + `input.shape[-2]` by `input.shape[-1]` (hence `input.shape[-2]` must equal + `input.shape[-1]`). + + #### Examples + + ```python + tfd.bijectors.MatrixInverseTriL().forward(x=[[1., 0], [2, 1]]) + # Result: [[1., 0], [-2, 1]], i.e., inv(x) + + tfd.bijectors.MatrixInverseTriL().inverse(y=[[1., 0], [-2, 1]]) + # Result: [[1., 0], [2, 1]], i.e., inv(y). + ``` + + """ + + def __init__(self, validate_args=False, name="matrix_inverse_tril"): + """Instantiates the `MatrixInverseTriL` bijector. + + Args: + validate_args: Python `bool` indicating whether arguments should be + checked for correctness. + name: Python `str` name given to ops managed by this object. + """ + self._graph_parents = [] + self._name = name + super(MatrixInverseTriL, self).__init__( + forward_min_event_ndims=2, + validate_args=validate_args, + name=name) + + def _forward(self, x): + with ops.control_dependencies(self._assertions(x)): + shape = array_ops.shape(x) + return linalg_ops.matrix_triangular_solve( + x, linalg_ops.eye(shape[-1], batch_shape=shape[:-2]), lower=True) + + def _inverse(self, y): + return self._forward(y) + + def _forward_log_det_jacobian(self, x): + # Calculation of the Jacobian: + # + # Let X = (x_{ij}), 0 <= i,j < n, be a matrix of indeterminates. Let Z = + # X^{-1} where Z = (z_{ij}). Then + # + # dZ/dx_{ij} = (d/dt | t=0) Y(t)^{-1}, + # + # where Y(t) = X + t*E_{ij} and E_{ij} is the matrix with a 1 in the (i,j) + # entry and zeros elsewhere. 
By the product rule, + # + # 0 = d/dt [Identity matrix] + # = d/dt [Y Y^{-1}] + # = Y d/dt[Y^{-1}] + dY/dt Y^{-1} + # + # so + # + # d/dt[Y^{-1}] = -Y^{-1} dY/dt Y^{-1} + # = -Y^{-1} E_{ij} Y^{-1}. + # + # Evaluating at t=0, + # + # dZ/dx_{ij} = -Z E_{ij} Z. + # + # Taking the (r,s) entry of each side, + # + # dz_{rs}/dx_{ij} = -z_{ri}z_{sj}. + # + # Now, let J be the Jacobian dZ/dX, arranged as the n^2-by-n^2 matrix whose + # (r*n + s, i*n + j) entry is dz_{rs}/dx_{ij}. Considering J as an n-by-n + # block matrix with n-by-n blocks, the above expression for dz_{rs}/dx_{ij} + # shows that the block at position (r,i) is -z_{ri}Z. Hence + # + # J = -KroneckerProduct(Z, Z), + # det(J) = (-1)^(n^2) (det Z)^(2n) + # = (-1)^n (det X)^(-2n). + with ops.control_dependencies(self._assertions(x)): + return (-2. * math_ops.cast(array_ops.shape(x)[-1], x.dtype.base_dtype) * + math_ops.reduce_sum( + math_ops.log(math_ops.abs(array_ops.matrix_diag_part(x))), + axis=-1)) + + def _assertions(self, x): + if not self.validate_args: + return [] + shape = array_ops.shape(x) + is_matrix = check_ops.assert_rank_at_least( + x, 2, message="Input must have rank at least 2.") + is_square = check_ops.assert_equal( + shape[-2], shape[-1], message="Input must be a square matrix.") + above_diagonal = array_ops.matrix_band_part( + array_ops.matrix_set_diag( + x, array_ops.zeros(shape[:-1], dtype=dtypes.float32)), + 0, -1) + is_lower_triangular = check_ops.assert_equal( + above_diagonal, array_ops.zeros_like(above_diagonal), + message="Input must be lower triangular.") + # A lower triangular matrix is nonsingular iff all its diagonal entries are + # nonzero. + diag_part = array_ops.matrix_diag_part(x) + is_nonsingular = check_ops.assert_none_equal( + diag_part, array_ops.zeros_like(diag_part), + message="Input must have all diagonal entries nonzero.") + return [is_matrix, is_square, is_lower_triangular, is_nonsingular] diff --git a/tensorflow/contrib/distributions/python/ops/binomial.py b/tensorflow/contrib/distributions/python/ops/binomial.py index 12d16031783b78..e4944beedcbca0 100644 --- a/tensorflow/contrib/distributions/python/ops/binomial.py +++ b/tensorflow/contrib/distributions/python/ops/binomial.py @@ -163,7 +163,7 @@ def __init__(self, more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[total_count, logits, probs]) as name: self._total_count = self._maybe_assert_valid_total_count( ops.convert_to_tensor(total_count, name="total_count"), diff --git a/tensorflow/contrib/distributions/python/ops/cauchy.py b/tensorflow/contrib/distributions/python/ops/cauchy.py index daacfe657fe154..23b6a83c17d586 100644 --- a/tensorflow/contrib/distributions/python/ops/cauchy.py +++ b/tensorflow/contrib/distributions/python/ops/cauchy.py @@ -120,7 +120,7 @@ def __init__(self, Raises: TypeError: if `loc` and `scale` have different `dtype`. 
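One cross-check on the `MatrixInverseTriL` Jacobian derivation above: the derivation counts all `n**2` entries of `X` as free coordinates, while the bijector's domain is lower-triangular matrices with `n(n+1)/2` free entries. The NumPy sketch below (hypothetical helper, not part of this change) brute-forces the Jacobian restricted to the lower-triangular entries for a 2x2 example; it matches `-(n+1) * sum(log|diag(L)|)` rather than the `-2n * sum(log|diag(L)|)` returned by `_forward_log_det_jacobian` (the two agree only at `n = 1`), so the constant may be worth double-checking.

```python
import numpy as np

def inv_tril_params(params, n):
  """Lower-triangular entries of L -> lower-triangular entries of inv(L)."""
  L = np.zeros((n, n))
  L[np.tril_indices(n)] = params
  return np.linalg.inv(L)[np.tril_indices(n)]

n = 2
params = np.array([2., 1., 3.])   # L = [[2, 0], [1, 3]], diag = [2, 3]
eps = 1e-6
jac = np.zeros((len(params), len(params)))
for j in range(len(params)):
  dp = np.zeros_like(params)
  dp[j] = eps
  jac[:, j] = (inv_tril_params(params + dp, n) -
               inv_tril_params(params - dp, n)) / (2. * eps)

log_diag_sum = np.sum(np.log([2., 3.]))
print(np.log(np.abs(np.linalg.det(jac))))  # brute force: ~ -5.375
print(-(n + 1) * log_diag_sum)             # -(n+1) * sum log|diag| ~ -5.375
print(-2. * n * log_diag_sum)              # -2n * sum log|diag|   ~ -7.167
```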
""" - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[loc, scale]) as name: with ops.control_dependencies([check_ops.assert_positive(scale)] if validate_args else []): diff --git a/tensorflow/contrib/distributions/python/ops/chi2.py b/tensorflow/contrib/distributions/python/ops/chi2.py index c77c5fd20895a6..686ae1ba74641e 100644 --- a/tensorflow/contrib/distributions/python/ops/chi2.py +++ b/tensorflow/contrib/distributions/python/ops/chi2.py @@ -83,7 +83,7 @@ def __init__(self, more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) # Even though all stats of chi2 are defined for valid parameters, this is # not true in the parent class "gamma." therefore, passing # allow_nan_stats=True @@ -119,7 +119,7 @@ def __init__(self, validate_args=False, allow_nan_stats=True, name="Chi2WithAbsDf"): - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[df]) as name: super(Chi2WithAbsDf, self).__init__( df=math_ops.floor( diff --git a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py index 10b45361358b40..3598c8d23ea900 100644 --- a/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py +++ b/tensorflow/contrib/distributions/python/ops/conditional_transformed_distribution.py @@ -20,7 +20,6 @@ from tensorflow.contrib.distributions.python.ops import conditional_distribution from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops -from tensorflow.python.framework import tensor_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops.distributions import transformed_distribution @@ -106,7 +105,7 @@ def _log_prob(self, y, bijector_kwargs=None, distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} x = self.bijector.inverse(y, **bijector_kwargs) - event_ndims = self._maybe_get_event_ndims_statically() + event_ndims = self._maybe_get_static_event_ndims() ildj = self.bijector.inverse_log_det_jacobian( y, event_ndims=event_ndims, **bijector_kwargs) if self.bijector._is_injective: # pylint: disable=protected-access @@ -131,7 +130,7 @@ def _prob(self, y, bijector_kwargs=None, distribution_kwargs=None): bijector_kwargs = bijector_kwargs or {} distribution_kwargs = distribution_kwargs or {} x = self.bijector.inverse(y, **bijector_kwargs) - event_ndims = self._maybe_get_event_ndims_statically() + event_ndims = self._maybe_get_static_event_ndims() ildj = self.bijector.inverse_log_det_jacobian( y, event_ndims=event_ndims, **bijector_kwargs) if self.bijector._is_injective: # pylint: disable=protected-access @@ -220,14 +219,14 @@ def _quantile(self, value, bijector_kwargs=None, distribution_kwargs=None): inv_cdf = self.distribution.quantile(value, **distribution_kwargs) return self.bijector.forward(inv_cdf, **bijector_kwargs) - def _maybe_get_event_ndims_statically(self): + def _maybe_get_static_event_ndims(self): if self.event_shape.ndims is not None: return self.event_shape.ndims event_ndims = array_ops.size(self.event_shape_tensor()) - static_event_ndims = tensor_util.constant_value(event_ndims) + event_ndims_ = distribution_util.maybe_get_static_value(event_ndims) - if static_event_ndims is not None: - return static_event_ndims 
+ if event_ndims_ is not None: + return event_ndims_ return event_ndims diff --git a/tensorflow/contrib/distributions/python/ops/deterministic.py b/tensorflow/contrib/distributions/python/ops/deterministic.py index a42350430e9851..c44c76a1338176 100644 --- a/tensorflow/contrib/distributions/python/ops/deterministic.py +++ b/tensorflow/contrib/distributions/python/ops/deterministic.py @@ -86,7 +86,7 @@ def __init__(self, Raises: ValueError: If `loc` is a scalar. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[loc, atol, rtol]) as name: loc = ops.convert_to_tensor(loc, name="loc") if is_vector and validate_args: diff --git a/tensorflow/contrib/distributions/python/ops/geometric.py b/tensorflow/contrib/distributions/python/ops/geometric.py index 53dd42f4c83fce..e1e42ee95d200d 100644 --- a/tensorflow/contrib/distributions/python/ops/geometric.py +++ b/tensorflow/contrib/distributions/python/ops/geometric.py @@ -85,7 +85,7 @@ def __init__(self, name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[logits, probs]) as name: self._logits, self._probs = distribution_util.get_logits_and_probs( logits, probs, validate_args=validate_args, name=name) diff --git a/tensorflow/contrib/distributions/python/ops/gumbel.py b/tensorflow/contrib/distributions/python/ops/gumbel.py index 2c261073ee1646..9d94fd11c62ce6 100644 --- a/tensorflow/contrib/distributions/python/ops/gumbel.py +++ b/tensorflow/contrib/distributions/python/ops/gumbel.py @@ -124,7 +124,7 @@ def __init__(self, Raises: TypeError: if loc and scale are different dtypes. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[loc, scale]) as name: with ops.control_dependencies([check_ops.assert_positive(scale)] if validate_args else []): diff --git a/tensorflow/contrib/distributions/python/ops/half_normal.py b/tensorflow/contrib/distributions/python/ops/half_normal.py index d0df2befd6e46c..9c96254d1c0a59 100644 --- a/tensorflow/contrib/distributions/python/ops/half_normal.py +++ b/tensorflow/contrib/distributions/python/ops/half_normal.py @@ -105,7 +105,7 @@ def __init__(self, if one or more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. 
""" - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[scale]) as name: with ops.control_dependencies([check_ops.assert_positive(scale)] if validate_args else []): diff --git a/tensorflow/contrib/distributions/python/ops/independent.py b/tensorflow/contrib/distributions/python/ops/independent.py index fbde55ef310de1..cd6eaa8407477b 100644 --- a/tensorflow/contrib/distributions/python/ops/independent.py +++ b/tensorflow/contrib/distributions/python/ops/independent.py @@ -116,7 +116,7 @@ def __init__( ValueError: if `reinterpreted_batch_ndims` exceeds `distribution.batch_ndims` """ - parameters = locals() + parameters = dict(locals()) name = name or "Independent" + distribution.name self._distribution = distribution with ops.name_scope(name) as name: diff --git a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py index 502bd4f493337b..208057b34db288 100644 --- a/tensorflow/contrib/distributions/python/ops/inverse_gamma.py +++ b/tensorflow/contrib/distributions/python/ops/inverse_gamma.py @@ -125,7 +125,7 @@ def __init__(self, Raises: TypeError: if `concentration` and `rate` are different dtypes. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[concentration, rate]) as name: with ops.control_dependencies([ check_ops.assert_positive(concentration), @@ -280,7 +280,7 @@ def __init__(self, validate_args=False, allow_nan_stats=True, name="InverseGammaWithSoftplusConcentrationRate"): - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[concentration, rate]) as name: super(InverseGammaWithSoftplusConcentrationRate, self).__init__( concentration=nn.softplus(concentration, diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py index 66682b2ff5493f..0ff989fc952c6f 100644 --- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py +++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py @@ -31,7 +31,6 @@ from tensorflow.python.ops.distributions import distribution from tensorflow.python.ops.distributions import transformed_distribution from tensorflow.python.ops.distributions import uniform -from tensorflow.python.util.tf_export import tf_export __all__ = [ "Kumaraswamy", @@ -59,7 +58,6 @@ def _harmonic_number(x): return math_ops.digamma(x + one) - math_ops.digamma(one) -@tf_export("distributions.Kumaraswamy") class Kumaraswamy(transformed_distribution.TransformedDistribution): """Kumaraswamy distribution. diff --git a/tensorflow/contrib/distributions/python/ops/logistic.py b/tensorflow/contrib/distributions/python/ops/logistic.py index c83b5bc2e3a8c5..27aa863440574e 100644 --- a/tensorflow/contrib/distributions/python/ops/logistic.py +++ b/tensorflow/contrib/distributions/python/ops/logistic.py @@ -119,7 +119,7 @@ def __init__(self, Raises: TypeError: if loc and scale are different dtypes. 
""" - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[loc, scale]) as name: with ops.control_dependencies([check_ops.assert_positive(scale)] if validate_args else []): diff --git a/tensorflow/contrib/distributions/python/ops/mixture.py b/tensorflow/contrib/distributions/python/ops/mixture.py index 2ef294af2e8bc9..bfb53a06c011ce 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture.py +++ b/tensorflow/contrib/distributions/python/ops/mixture.py @@ -116,7 +116,7 @@ def __init__(self, matching static batch shapes, or all components do not have matching static event shapes. """ - parameters = locals() + parameters = dict(locals()) if not isinstance(cat, categorical.Categorical): raise TypeError("cat must be a Categorical distribution, but saw: %s" % cat) diff --git a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py index 0b1301e551728f..112eefd3691815 100644 --- a/tensorflow/contrib/distributions/python/ops/mixture_same_family.py +++ b/tensorflow/contrib/distributions/python/ops/mixture_same_family.py @@ -130,7 +130,7 @@ def __init__(self, ValueError: if `mixture_distribution` categories does not equal `components_distribution` rightmost batch shape. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name) as name: self._mixture_distribution = mixture_distribution self._components_distribution = components_distribution diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag.py b/tensorflow/contrib/distributions/python/ops/mvn_diag.py index e3236c2db93695..d2beb2aff0481e 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_diag.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_diag.py @@ -193,7 +193,7 @@ def __init__(self, Raises: ValueError: if at most `scale_identity_multiplier` is specified. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name) as name: with ops.name_scope("init", values=[ loc, scale_diag, scale_identity_multiplier]): @@ -224,7 +224,7 @@ def __init__(self, validate_args=False, allow_nan_stats=True, name="MultivariateNormalDiagWithSoftplusScale"): - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[scale_diag]) as name: super(MultivariateNormalDiagWithSoftplusScale, self).__init__( loc=loc, diff --git a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py index 2f6a6f198cbcfb..5117379b047f5e 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_diag_plus_low_rank.py @@ -215,7 +215,7 @@ def __init__(self, Raises: ValueError: if at most `scale_identity_multiplier` is specified. 
""" - parameters = locals() + parameters = dict(locals()) def _convert_to_tensor(x, name): return None if x is None else ops.convert_to_tensor(x, name=name) with ops.name_scope(name) as name: diff --git a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py index 86fcd4db54ad85..57f47db50c496f 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_full_covariance.py @@ -45,7 +45,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL): The probability density function (pdf) is, with `@` as matrix multiplication, ```none - pdf(x; loc, covariance_matrix) = exp(-0.5 ||y||**2) / Z, + pdf(x; loc, covariance_matrix) = exp(-0.5 y) / Z, y = (x - loc)^T @ inv(covariance_matrix) @ (x - loc) Z = (2 pi)**(0.5 k) |det(covariance_matrix)|**(0.5). ``` @@ -54,8 +54,7 @@ class MultivariateNormalFullCovariance(mvn_tril.MultivariateNormalTriL): * `loc` is a vector in `R^k`, * `covariance_matrix` is an `R^{k x k}` symmetric positive definite matrix, - * `Z` denotes the normalization constant, and, - * `||y||**2` denotes the squared Euclidean norm of `y`. + * `Z` denotes the normalization constant. Additional leading dimensions (if any) in `loc` and `covariance_matrix` allow for batch dimensions. @@ -156,7 +155,7 @@ def __init__(self, Raises: ValueError: if neither `loc` nor `covariance_matrix` are specified. """ - parameters = locals() + parameters = dict(locals()) # Convert the covariance_matrix up to a scale_tril and call MVNTriL. with ops.name_scope(name) as name: diff --git a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py index 44c92312c7dc75..6a0383db025552 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_linear_operator.py @@ -170,7 +170,7 @@ def __init__(self, ValueError: if `scale` is unspecified. TypeError: if not `scale.dtype.is_floating` """ - parameters = locals() + parameters = dict(locals()) if scale is None: raise ValueError("Missing required `scale` parameter.") if not scale.dtype.is_floating: diff --git a/tensorflow/contrib/distributions/python/ops/mvn_tril.py b/tensorflow/contrib/distributions/python/ops/mvn_tril.py index d6f8b731cbeed5..c809ef3c1cb5b8 100644 --- a/tensorflow/contrib/distributions/python/ops/mvn_tril.py +++ b/tensorflow/contrib/distributions/python/ops/mvn_tril.py @@ -179,7 +179,7 @@ def __init__(self, Raises: ValueError: if neither `loc` nor `scale_tril` are specified. """ - parameters = locals() + parameters = dict(locals()) def _convert_to_tensor(x, name): return None if x is None else ops.convert_to_tensor(x, name=name) if loc is None and scale_tril is None: diff --git a/tensorflow/contrib/distributions/python/ops/negative_binomial.py b/tensorflow/contrib/distributions/python/ops/negative_binomial.py index eeaf9c0a5ebc13..2bd11e24b315e0 100644 --- a/tensorflow/contrib/distributions/python/ops/negative_binomial.py +++ b/tensorflow/contrib/distributions/python/ops/negative_binomial.py @@ -90,7 +90,7 @@ def __init__(self, name: Python `str` name prefixed to Ops created by this class. 
""" - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[total_count, logits, probs]) as name: self._logits, self._probs = distribution_util.get_logits_and_probs( logits, probs, validate_args=validate_args, name=name) diff --git a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py index 305b138fdc2318..3e44c10fab726a 100644 --- a/tensorflow/contrib/distributions/python/ops/onehot_categorical.py +++ b/tensorflow/contrib/distributions/python/ops/onehot_categorical.py @@ -115,7 +115,7 @@ def __init__( more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[logits, probs]) as name: self._logits, self._probs = distribution_util.get_logits_and_probs( name=name, logits=logits, probs=probs, validate_args=validate_args, diff --git a/tensorflow/contrib/distributions/python/ops/poisson.py b/tensorflow/contrib/distributions/python/ops/poisson.py index a84aad6fc93723..04de8106ee0c06 100644 --- a/tensorflow/contrib/distributions/python/ops/poisson.py +++ b/tensorflow/contrib/distributions/python/ops/poisson.py @@ -93,7 +93,7 @@ def __init__(self, TypeError: if `rate` is not a float-type. TypeError: if `log_rate` is not a float-type. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[rate]) as name: if (rate is None) == (log_rate is None): raise ValueError("Must specify exactly one of `rate` and `log_rate`.") diff --git a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py index 19c99dcee92978..7b10ba998f0cea 100644 --- a/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py +++ b/tensorflow/contrib/distributions/python/ops/poisson_lognormal.py @@ -255,7 +255,7 @@ def __init__(self, TypeError: if `quadrature_grid` and `quadrature_probs` have different base `dtype`. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[loc, scale]) as name: if loc is not None: loc = ops.convert_to_tensor(loc, name="loc") diff --git a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py index 1ef7651d03a338..5ac6c34b538016 100644 --- a/tensorflow/contrib/distributions/python/ops/quantized_distribution.py +++ b/tensorflow/contrib/distributions/python/ops/quantized_distribution.py @@ -128,7 +128,7 @@ def _logsum_expbig_minus_expsmall(big, small): class QuantizedDistribution(distributions.Distribution): """Distribution representing the quantization `Y = ceiling(X)`. - #### Definition in terms of sampling. + #### Definition in Terms of Sampling ``` 1. Draw X @@ -138,7 +138,7 @@ class QuantizedDistribution(distributions.Distribution): 5. Return Y ``` - #### Definition in terms of the probability mass function. + #### Definition in Terms of the Probability Mass Function Given scalar random variable `X`, we define a discrete random variable `Y` supported on the integers as follows: @@ -170,12 +170,62 @@ class QuantizedDistribution(distributions.Distribution): `P[Y = j]` is still the mass of `X` within the `jth` interval. - #### Caveats + #### Examples + + We illustrate a mixture of discretized logistic distributions + [(Salimans et al., 2017)][1]. 
This is used, for example, for capturing 16-bit + audio in WaveNet [(van den Oord et al., 2017)][2]. The values range in + a 1-D integer domain of `[0, 2**16-1]`, and the discretization captures + `P(x - 0.5 < X <= x + 0.5)` for all `x` in the domain excluding the endpoints. + The lowest value has probability `P(X <= 0.5)` and the highest value has + probability `P(2**16 - 1.5 < X)`. + + Below we assume a `wavenet` function. It takes as `input` right-shifted audio + samples of shape `[..., sequence_length]`. It returns a real-valued tensor of + shape `[..., num_mixtures * 3]`, i.e., each mixture component has a `loc` and + `scale` parameter belonging to the logistic distribution, and a `logits` + parameter determining the unnormalized probability of that component. + + ```python + tfd = tf.contrib.distributions + tfb = tfd.bijectors + + net = wavenet(inputs) + loc, unconstrained_scale, logits = tf.split(net, + num_or_size_splits=3, + axis=-1) + scale = tf.nn.softplus(unconstrained_scale) + + # Form mixture of discretized logistic distributions. Note we shift the + # logistic distribution by -0.5. This lets the quantization capture "rounding" + # intervals, `(x-0.5, x+0.5]`, and not "ceiling" intervals, `(x-1, x]`. + discretized_logistic_dist = tfd.QuantizedDistribution( + distribution=tfd.TransformedDistribution( + distribution=tfd.Logistic(loc=loc, scale=scale), + bijector=tfb.AffineScalar(shift=-0.5)), + low=0., + high=2**16 - 1.) + mixture_dist = tfd.MixtureSameFamily( + mixture_distribution=tfd.Categorical(logits=logits), + components_distribution=discretized_logistic_dist) + + neg_log_likelihood = -tf.reduce_sum(mixture_dist.log_prob(targets)) + train_op = tf.train.AdamOptimizer().minimize(neg_log_likelihood) + ``` + + After instantiating `mixture_dist`, we illustrate maximum likelihood by + calculating its log-probability of audio samples as `target` and optimizing. + + #### References - Since evaluation of each `P[Y = j]` involves a cdf evaluation (rather than - a closed form function such as for a Poisson), computations such as mean and - entropy are better done with samples or approximations, and are not - implemented by this class. + [1]: Tim Salimans, Andrej Karpathy, Xi Chen, and Diederik P. Kingma. + PixelCNN++: Improving the PixelCNN with discretized logistic mixture + likelihood and other modifications. + _International Conference on Learning Representations_, 2017. + https://arxiv.org/abs/1701.05517 + [2]: Aaron van den Oord et al. Parallel WaveNet: Fast High-Fidelity Speech + Synthesis. _arXiv preprint arXiv:1711.10433_, 2017. + https://arxiv.org/abs/1711.10433 """ def __init__(self, @@ -213,7 +263,7 @@ def __init__(self, `Distribution` or continuous. NotImplementedError: If the base distribution does not implement `cdf`. """ - parameters = locals() + parameters = dict(locals()) values = ( list(distribution.parameters.values()) + [low, high]) diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py index 84c8d29072c2f1..4182ca2b56ea80 100644 --- a/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py +++ b/tensorflow/contrib/distributions/python/ops/relaxed_bernoulli.py @@ -165,7 +165,7 @@ def __init__(self, Raises: ValueError: If both `probs` and `logits` are passed, or if neither. 
""" - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[logits, probs, temperature]) as name: with ops.control_dependencies([check_ops.assert_positive(temperature)] if validate_args else []): diff --git a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py index 325f41e37c928b..5414f347cd65e2 100644 --- a/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py +++ b/tensorflow/contrib/distributions/python/ops/relaxed_onehot_categorical.py @@ -162,7 +162,7 @@ def __init__( more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[logits, probs, temperature]) as name: self._logits, self._probs = distribution_util.get_logits_and_probs( diff --git a/tensorflow/contrib/distributions/python/ops/seed_stream.py b/tensorflow/contrib/distributions/python/ops/seed_stream.py index 056d349688511e..cf505ac627b62a 100644 --- a/tensorflow/contrib/distributions/python/ops/seed_stream.py +++ b/tensorflow/contrib/distributions/python/ops/seed_stream.py @@ -169,7 +169,7 @@ def __init__(self, seed, salt): and TensorFlow Probability code base. See class docstring for rationale. """ - self._seed = seed + self._seed = seed.original_seed if isinstance(seed, SeedStream) else seed self._salt = salt self._counter = 0 diff --git a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py index 03828fa61277ee..a764544932cea8 100644 --- a/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py +++ b/tensorflow/contrib/distributions/python/ops/sinh_arcsinh.py @@ -132,7 +132,7 @@ def __init__(self, if one or more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[loc, scale, skewness, tailweight]) as name: diff --git a/tensorflow/contrib/distributions/python/ops/statistical_testing.py b/tensorflow/contrib/distributions/python/ops/statistical_testing.py index 9c69435fac1099..c25e8c51d7705b 100644 --- a/tensorflow/contrib/distributions/python/ops/statistical_testing.py +++ b/tensorflow/contrib/distributions/python/ops/statistical_testing.py @@ -140,6 +140,7 @@ "assert_true_mean_equal_by_dkwm", "min_discrepancy_of_true_means_detectable_by_dkwm", "min_num_samples_for_dkwm_mean_test", + "assert_true_mean_in_interval_by_dkwm", "assert_true_mean_equal_by_dkwm_two_sample", "min_discrepancy_of_true_means_detectable_by_dkwm_two_sample", "min_num_samples_for_dkwm_mean_two_sample_test", @@ -209,17 +210,17 @@ def _maximum_mean(samples, envelope, high, name=None): separately. Args: - samples: Floating-point tensor of samples from the distribution(s) + samples: Floating-point `Tensor` of samples from the distribution(s) of interest. Entries are assumed IID across the 0th dimension. The other dimensions must broadcast with `envelope` and `high`. - envelope: Floating-point tensor of sizes of admissible CDF + envelope: Floating-point `Tensor` of sizes of admissible CDF envelopes (i.e., the `eps` above). - high: Floating-point tensor of upper bounds on the distributions' - supports. + high: Floating-point `Tensor` of upper bounds on the distributions' + supports. `samples <= high`. name: A name for this operation (optional). 
Returns: - bound: Floating-point tensor of upper bounds on the true means. + bound: Floating-point `Tensor` of upper bounds on the true means. Raises: InvalidArgumentError: If some `sample` is found to be larger than @@ -254,17 +255,17 @@ def _minimum_mean(samples, envelope, low, name=None): separately. Args: - samples: Floating-point tensor of samples from the distribution(s) + samples: Floating-point `Tensor` of samples from the distribution(s) of interest. Entries are assumed IID across the 0th dimension. The other dimensions must broadcast with `envelope` and `low`. - envelope: Floating-point tensor of sizes of admissible CDF + envelope: Floating-point `Tensor` of sizes of admissible CDF envelopes (i.e., the `eps` above). - low: Floating-point tensor of lower bounds on the distributions' - supports. + low: Floating-point `Tensor` of lower bounds on the distributions' + supports. `samples >= low`. name: A name for this operation (optional). Returns: - bound: Floating-point tensor of lower bounds on the true means. + bound: Floating-point `Tensor` of lower bounds on the true means. Raises: InvalidArgumentError: If some `sample` is found to be smaller than @@ -300,12 +301,12 @@ def _dkwm_cdf_envelope(n, error_rate, name=None): probability above. Args: - n: Tensor of numbers of samples drawn. - error_rate: Floating-point tensor of admissible rates of mistakes. + n: `Tensor` of numbers of samples drawn. + error_rate: Floating-point `Tensor` of admissible rates of mistakes. name: A name for this operation (optional). Returns: - eps: Tensor of maximum distances the true CDF can be from the + eps: `Tensor` of maximum distances the true CDF can be from the empirical CDF. This scales as `O(sqrt(-log(error_rate)))` and as `O(1 / sqrt(n))`. The shape is the broadcast of `n` and `error_rate`. @@ -324,8 +325,8 @@ def _check_shape_dominates(samples, parameters): sample counts end up inflated. Args: - samples: A Tensor whose shape is to be protected against broadcasting. - parameters: A list of Tensors who are parameters for the statistical test. + samples: A `Tensor` whose shape is to be protected against broadcasting. + parameters: A list of `Tensor`s who are parameters for the statistical test. Returns: samples: Return original `samples` with control dependencies attached @@ -369,19 +370,23 @@ def true_mean_confidence_interval_by_dkwm( members. Args: - samples: Floating-point tensor of samples from the distribution(s) + samples: Floating-point `Tensor` of samples from the distribution(s) of interest. Entries are assumed IID across the 0th dimension. The other dimensions must broadcast with `low` and `high`. - low: Floating-point tensor of lower bounds on the distributions' + The support is bounded: `low <= samples <= high`. + low: Floating-point `Tensor` of lower bounds on the distributions' supports. - high: Floating-point tensor of upper bounds on the distributions' + high: Floating-point `Tensor` of upper bounds on the distributions' supports. - error_rate: *Scalar* admissible total rate of mistakes. + error_rate: *Scalar* floating-point `Tensor` admissible total rate + of mistakes. name: A name for this operation (optional). Returns: - low: A floating-point tensor of stochastic lower bounds on the true means. - high: A floating-point tensor of stochastic upper bounds on the true means. + low: A floating-point `Tensor` of stochastic lower bounds on the + true means. + high: A floating-point `Tensor` of stochastic upper bounds on the + true means. 
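For readers new to these DKWM utilities, a short usage sketch (assuming the module is imported from its contrib path; the printed numbers are only indicative): draw bounded samples, ask for stochastic bounds on the true mean, and assert that the true mean lies in an expected interval.

```python
import tensorflow as tf
from tensorflow.contrib.distributions.python.ops import statistical_testing as st

samples = tf.random_uniform([5000], minval=0., maxval=1., seed=17)
low, high = 0., 1.

# Bounds that contain the true mean except with probability ~1e-6.
min_mean, max_mean = st.true_mean_confidence_interval_by_dkwm(
    samples, low, high, error_rate=1e-6)

# Raises InvalidArgumentError only if there is strong evidence that the true
# mean lies outside [0.45, 0.55].
check = st.assert_true_mean_in_interval_by_dkwm(
    samples, low, high, expected_low=0.45, expected_high=0.55,
    false_fail_rate=1e-6)

with tf.Session() as sess:
  print(sess.run([min_mean, max_mean]))  # A narrow interval around 0.5.
  sess.run(check)                        # Passes for Uniform(0, 1) samples.
```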
""" with ops.name_scope( name, "true_mean_confidence_interval_by_dkwm", @@ -436,15 +441,17 @@ def assert_true_mean_equal_by_dkwm( the assertion will insist on stronger evidence to fail any one member. Args: - samples: Floating-point tensor of samples from the distribution(s) + samples: Floating-point `Tensor` of samples from the distribution(s) of interest. Entries are assumed IID across the 0th dimension. The other dimensions must broadcast with `low` and `high`. - low: Floating-point tensor of lower bounds on the distributions' + The support is bounded: `low <= samples <= high`. + low: Floating-point `Tensor` of lower bounds on the distributions' supports. - high: Floating-point tensor of upper bounds on the distributions' + high: Floating-point `Tensor` of upper bounds on the distributions' supports. - expected: Floating-point tensor of expected true means. - false_fail_rate: *Scalar* admissible total rate of mistakes. + expected: Floating-point `Tensor` of expected true means. + false_fail_rate: *Scalar* floating-point `Tensor` admissible total + rate of mistakes. name: A name for this operation (optional). Returns: @@ -454,20 +461,8 @@ def assert_true_mean_equal_by_dkwm( with ops.name_scope( name, "assert_true_mean_equal_by_dkwm", [samples, low, high, expected, false_fail_rate]): - samples = ops.convert_to_tensor(samples, name="samples") - low = ops.convert_to_tensor(low, name="low") - high = ops.convert_to_tensor(high, name="high") - expected = ops.convert_to_tensor(expected, name="expected") - false_fail_rate = ops.convert_to_tensor( - false_fail_rate, name="false_fail_rate") - samples = _check_shape_dominates(samples, [low, high, expected]) - min_mean, max_mean = true_mean_confidence_interval_by_dkwm( - samples, low, high, error_rate=false_fail_rate) - less_op = check_ops.assert_less( - min_mean, expected, message="Mean confidence interval too high") - with ops.control_dependencies([less_op]): - return check_ops.assert_greater( - max_mean, expected, message="Mean confidence interval too low") + return assert_true_mean_in_interval_by_dkwm( + samples, low, high, expected, expected, false_fail_rate) def min_discrepancy_of_true_means_detectable_by_dkwm( @@ -487,30 +482,35 @@ def min_discrepancy_of_true_means_detectable_by_dkwm( with the same `false_pass_rate`. Args: - n: Tensor of numbers of samples to be drawn from the distributions + n: `Tensor` of numbers of samples to be drawn from the distributions of interest. - low: Floating-point tensor of lower bounds on the distributions' + low: Floating-point `Tensor` of lower bounds on the distributions' supports. - high: Floating-point tensor of upper bounds on the distributions' + high: Floating-point `Tensor` of upper bounds on the distributions' supports. - false_fail_rate: *Scalar* admissible total rate of false failures. - false_pass_rate: *Scalar* admissible rate of false passes. + false_fail_rate: *Scalar* floating-point `Tensor` admissible total + rate of false failures. + false_pass_rate: *Scalar* floating-point `Tensor` admissible rate + of false passes. name: A name for this operation (optional). Returns: - discr: Tensor of lower bounds on the distances between true + discr: `Tensor` of lower bounds on the distances between true means detectable by a DKWM-based test. For each batch member `i`, of `K` total, drawing `n[i]` samples from some scalar distribution supported on `[low[i], high[i]]` is enough to detect a difference in means of size `discr[i]` or more. 
Specifically, we guarantee that (a) if the true mean is the expected - mean, `assert_true_mean_equal_by_dkwm` will fail with probability at - most `false_fail_rate / K` (which amounts to `false_fail_rate` if - applied to the whole batch at once), and (b) if the true mean - differs from the expected mean by at least `discr[i]`, - `assert_true_mean_equal_by_dkwm` will pass with probability at most - `false_pass_rate`. + mean (resp. in the expected interval), then `assert_true_mean_equal_by_dkwm` + (resp. `assert_true_mean_in_interval_by_dkwm`) will fail with + probability at most `false_fail_rate / K` (which amounts to + `false_fail_rate` if applied to the whole batch at once), and (b) if + the true mean differs from the expected mean (resp. falls outside + the expected interval) by at least `discr[i]`, + `assert_true_mean_equal_by_dkwm` + (resp. `assert_true_mean_in_interval_by_dkwm`) will pass with + probability at most `false_pass_rate`. The detectable discrepancy scales as @@ -558,17 +558,19 @@ def min_num_samples_for_dkwm_mean_test( on a scalar distribution supported on `[low, high]`. Args: - discrepancy: Floating-point tensor of desired upper limits on mean + discrepancy: Floating-point `Tensor` of desired upper limits on mean differences that may go undetected with probability higher than `1 - false_pass_rate`. - low: Tensor of lower bounds on the distributions' support. - high: Tensor of upper bounds on the distributions' support. - false_fail_rate: *Scalar* admissible total rate of false failures. - false_pass_rate: *Scalar* admissible rate of false passes. + low: `Tensor` of lower bounds on the distributions' support. + high: `Tensor` of upper bounds on the distributions' support. + false_fail_rate: *Scalar* floating-point `Tensor` admissible total + rate of false failures. + false_pass_rate: *Scalar* floating-point `Tensor` admissible rate + of false passes. name: A name for this operation (optional). Returns: - n: Tensor of numbers of samples to be drawn from the distributions + n: `Tensor` of numbers of samples to be drawn from the distributions of interest. The `discrepancy`, `low`, and `high` tensors must have @@ -578,12 +580,15 @@ def min_num_samples_for_dkwm_mean_test( some scalar distribution supported on `[low[i], high[i]]` is enough to detect a difference in means of size `discrepancy[i]` or more. Specifically, we guarantee that (a) if the true mean is the expected - mean, `assert_true_mean_equal_by_dkwm` will fail with probability at - most `false_fail_rate / K` (which amounts to `false_fail_rate` if - applied to the whole batch at once), and (b) if the true mean - differs from the expected mean by at least `discrepancy[i]`, - `assert_true_mean_equal_by_dkwm` will pass with probability at most - `false_pass_rate`. + mean (resp. in the expected interval), then `assert_true_mean_equal_by_dkwm` + (resp. `assert_true_mean_in_interval_by_dkwm`) will fail with + probability at most `false_fail_rate / K` (which amounts to + `false_fail_rate` if applied to the whole batch at once), and (b) if + the true mean differs from the expected mean (resp. falls outside + the expected interval) by at least `discrepancy[i]`, + `assert_true_mean_equal_by_dkwm` + (resp. `assert_true_mean_in_interval_by_dkwm`) will pass with + probability at most `false_pass_rate`. 
The required number of samples scales as `O((high[i] - low[i])**2)`, `O(-log(false_fail_rate/K))`, @@ -610,6 +615,76 @@ def min_num_samples_for_dkwm_mean_test( return math_ops.maximum(n1, n2) +def assert_true_mean_in_interval_by_dkwm( + samples, low, high, expected_low, expected_high, + false_fail_rate=1e-6, name=None): + """Asserts the mean of the given distribution is in the given interval. + + More precisely, fails if there is enough evidence (using the + [Dvoretzky-Kiefer-Wolfowitz-Massart inequality] + (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval)) + that the mean of the distribution from which the given samples are + drawn is _outside_ the given interval with statistical significance + `false_fail_rate` or stronger, otherwise passes. If you also want + to check that you are gathering enough evidence that a pass is not + spurious, see `min_num_samples_for_dkwm_mean_test` and + `min_discrepancy_of_true_means_detectable_by_dkwm`. + + Note that `false_fail_rate` is a total false failure rate for all + the assertions in the batch. As such, if the batch is nontrivial, + the assertion will insist on stronger evidence to fail any one member. + + Args: + samples: Floating-point `Tensor` of samples from the distribution(s) + of interest. Entries are assumed IID across the 0th dimension. + The other dimensions must broadcast with `low` and `high`. + The support is bounded: `low <= samples <= high`. + low: Floating-point `Tensor` of lower bounds on the distributions' + supports. + high: Floating-point `Tensor` of upper bounds on the distributions' + supports. + expected_low: Floating-point `Tensor` of lower bounds on the + expected true means. + expected_high: Floating-point `Tensor` of upper bounds on the + expected true means. + false_fail_rate: *Scalar* floating-point `Tensor` admissible total + rate of mistakes. + name: A name for this operation (optional). + + Returns: + check: Op that raises `InvalidArgumentError` if any expected mean + interval does not overlap with the corresponding confidence + interval. + """ + with ops.name_scope( + name, "assert_true_mean_in_interval_by_dkwm", + [samples, low, high, expected_low, expected_high, false_fail_rate]): + samples = ops.convert_to_tensor(samples, name="samples") + low = ops.convert_to_tensor(low, name="low") + high = ops.convert_to_tensor(high, name="high") + expected_low = ops.convert_to_tensor(expected_low, name="expected_low") + expected_high = ops.convert_to_tensor(expected_high, name="expected_high") + false_fail_rate = ops.convert_to_tensor( + false_fail_rate, name="false_fail_rate") + samples = _check_shape_dominates( + samples, [low, high, expected_low, expected_high]) + min_mean, max_mean = true_mean_confidence_interval_by_dkwm( + samples, low, high, false_fail_rate) + # Assert that the interval [min_mean, max_mean] intersects the + # interval [expected_low, expected_high]. This is true if + # max_mean >= expected_low and min_mean <= expected_high. + # By DeMorgan's law, that's also equivalent to + # not (max_mean < expected_low or min_mean > expected_high), + # which is a way of saying the two intervals are not disjoint. 
+ check_confidence_interval_can_intersect = check_ops.assert_greater_equal( + max_mean, expected_low, message="Confidence interval does not " + "intersect: true mean smaller than expected") + with ops.control_dependencies([check_confidence_interval_can_intersect]): + return check_ops.assert_less_equal( + min_mean, expected_high, message="Confidence interval does not " + "intersect: true mean greater than expected") + + def assert_true_mean_equal_by_dkwm_two_sample( samples1, low1, high1, samples2, low2, high2, false_fail_rate=1e-6, name=None): @@ -630,23 +705,26 @@ def assert_true_mean_equal_by_dkwm_two_sample( the assertion will insist on stronger evidence to fail any one member. Args: - samples1: Floating-point tensor of samples from the + samples1: Floating-point `Tensor` of samples from the distribution(s) A. Entries are assumed IID across the 0th dimension. The other dimensions must broadcast with `low1`, `high1`, `low2`, and `high2`. - low1: Floating-point tensor of lower bounds on the supports of the + The support is bounded: `low1 <= samples1 <= high1`. + low1: Floating-point `Tensor` of lower bounds on the supports of the distributions A. - high1: Floating-point tensor of upper bounds on the supports of + high1: Floating-point `Tensor` of upper bounds on the supports of the distributions A. - samples2: Floating-point tensor of samples from the + samples2: Floating-point `Tensor` of samples from the distribution(s) B. Entries are assumed IID across the 0th dimension. The other dimensions must broadcast with `low1`, `high1`, `low2`, and `high2`. - low2: Floating-point tensor of lower bounds on the supports of the + The support is bounded: `low2 <= samples2 <= high2`. + low2: Floating-point `Tensor` of lower bounds on the supports of the distributions B. - high2: Floating-point tensor of upper bounds on the supports of + high2: Floating-point `Tensor` of upper bounds on the supports of the distributions B. - false_fail_rate: *Scalar* admissible total rate of mistakes. + false_fail_rate: *Scalar* floating-point `Tensor` admissible total + rate of mistakes. name: A name for this operation (optional). Returns: @@ -676,20 +754,10 @@ def assert_true_mean_equal_by_dkwm_two_sample( # and sample counts should be valid; however, because the intervals # scale as O(-log(false_fail_rate)), there doesn't seem to be much # room to win. - min_mean_1, max_mean_1 = true_mean_confidence_interval_by_dkwm( - samples1, low1, high1, false_fail_rate / 2.) min_mean_2, max_mean_2 = true_mean_confidence_interval_by_dkwm( samples2, low2, high2, false_fail_rate / 2.) - # I want to assert - # not (max_mean_1 < min_mean_2 or min_mean_1 > max_mean_2), - # but I think I only have and-combination of asserts, so use DeMorgan. - check_confidence_intervals_can_intersect = check_ops.assert_greater_equal( - max_mean_1, min_mean_2, message="Confidence intervals do not " - "intersect: samples1 has a smaller mean than samples2") - with ops.control_dependencies([check_confidence_intervals_can_intersect]): - return check_ops.assert_less_equal( - min_mean_1, max_mean_2, message="Confidence intervals do not " - "intersect: samples2 has a smaller mean than samples1") + return assert_true_mean_in_interval_by_dkwm( + samples1, low1, high1, min_mean_2, max_mean_2, false_fail_rate / 2.) def min_discrepancy_of_true_means_detectable_by_dkwm_two_sample( @@ -710,22 +778,24 @@ def min_discrepancy_of_true_means_detectable_by_dkwm_two_sample( with the same `false_pass_rate`. 
Args: - n1: Tensor of numbers of samples to be drawn from the distributions A. - low1: Floating-point tensor of lower bounds on the supports of the + n1: `Tensor` of numbers of samples to be drawn from the distributions A. + low1: Floating-point `Tensor` of lower bounds on the supports of the distributions A. - high1: Floating-point tensor of upper bounds on the supports of + high1: Floating-point `Tensor` of upper bounds on the supports of the distributions A. - n2: Tensor of numbers of samples to be drawn from the distributions B. - low2: Floating-point tensor of lower bounds on the supports of the + n2: `Tensor` of numbers of samples to be drawn from the distributions B. + low2: Floating-point `Tensor` of lower bounds on the supports of the distributions B. - high2: Floating-point tensor of upper bounds on the supports of + high2: Floating-point `Tensor` of upper bounds on the supports of the distributions B. - false_fail_rate: *Scalar* admissible total rate of false failures. - false_pass_rate: *Scalar* admissible rate of false passes. + false_fail_rate: *Scalar* floating-point `Tensor` admissible total + rate of false failures. + false_pass_rate: *Scalar* floating-point `Tensor` admissible rate + of false passes. name: A name for this operation (optional). Returns: - discr: Tensor of lower bounds on the distances between true means + discr: `Tensor` of lower bounds on the distances between true means detectable by a two-sample DKWM-based test. For each batch member `i`, of `K` total, drawing `n1[i]` samples @@ -776,24 +846,26 @@ def min_num_samples_for_dkwm_mean_two_sample_test( (https://en.wikipedia.org/wiki/CDF-based_nonparametric_confidence_interval). Args: - discrepancy: Floating-point tensor of desired upper limits on mean + discrepancy: Floating-point `Tensor` of desired upper limits on mean differences that may go undetected with probability higher than `1 - false_pass_rate`. - low1: Floating-point tensor of lower bounds on the supports of the + low1: Floating-point `Tensor` of lower bounds on the supports of the distributions A. - high1: Floating-point tensor of upper bounds on the supports of + high1: Floating-point `Tensor` of upper bounds on the supports of the distributions A. - low2: Floating-point tensor of lower bounds on the supports of the + low2: Floating-point `Tensor` of lower bounds on the supports of the distributions B. - high2: Floating-point tensor of upper bounds on the supports of + high2: Floating-point `Tensor` of upper bounds on the supports of the distributions B. - false_fail_rate: *Scalar* admissible total rate of false failures. - false_pass_rate: *Scalar* admissible rate of false passes. + false_fail_rate: *Scalar* floating-point `Tensor` admissible total + rate of false failures. + false_pass_rate: *Scalar* floating-point `Tensor` admissible rate + of false passes. name: A name for this operation (optional). Returns: - n1: Tensor of numbers of samples to be drawn from the distributions A. - n2: Tensor of numbers of samples to be drawn from the distributions B. + n1: `Tensor` of numbers of samples to be drawn from the distributions A. + n2: `Tensor` of numbers of samples to be drawn from the distributions B. 
For each batch member `i`, of `K` total, drawing `n1[i]` samples from scalar distribution A supported on `[low1[i], high1[i]]` and `n2[i]` diff --git a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py index af6ff8162b1730..8d4914e16cd374 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py +++ b/tensorflow/contrib/distributions/python/ops/vector_diffeomixture.py @@ -395,7 +395,7 @@ def __init__(self, ValueError: if `not distribution.is_scalar_batch`. ValueError: if `not distribution.is_scalar_event`. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[mix_loc, temperature]) as name: if not scale or len(scale) < 2: raise ValueError("Must specify list (or list-like object) of scale " diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py index e265b5d0f7c10b..a75b3f3df1f286 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py +++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_diag.py @@ -175,7 +175,7 @@ def __init__(self, Raises: ValueError: if at most `scale_identity_multiplier` is specified. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name) as name: with ops.name_scope("init", values=[ loc, scale_diag, scale_identity_multiplier]): diff --git a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py index 89136d6760bb66..a7d4c55be93f61 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/vector_exponential_linear_operator.py @@ -175,7 +175,7 @@ def __init__(self, ValueError: if `scale` is unspecified. TypeError: if not `scale.dtype.is_floating` """ - parameters = locals() + parameters = dict(locals()) if scale is None: raise ValueError("Missing required `scale` parameter.") if not scale.dtype.is_floating: diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py index 8dd983b750d9b3..4a53e7a621f273 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py +++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_diag.py @@ -210,7 +210,7 @@ def __init__(self, Raises: ValueError: if at most `scale_identity_multiplier` is specified. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name): with ops.name_scope("init", values=[ loc, scale_diag, scale_identity_multiplier]): diff --git a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py index ec485c95c15da2..0566e04fece6f9 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py +++ b/tensorflow/contrib/distributions/python/ops/vector_laplace_linear_operator.py @@ -191,7 +191,7 @@ def __init__(self, ValueError: if `scale` is unspecified. 
TypeError: if not `scale.dtype.is_floating` """ - parameters = locals() + parameters = dict(locals()) if scale is None: raise ValueError("Missing required `scale` parameter.") if not scale.dtype.is_floating: diff --git a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py index 1438ede26500bc..bb33cd0762a368 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py +++ b/tensorflow/contrib/distributions/python/ops/vector_sinh_arcsinh_diag.py @@ -163,7 +163,7 @@ def __init__(self, Raises: ValueError: if at most `scale_identity_multiplier` is specified. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope( name, diff --git a/tensorflow/contrib/distributions/python/ops/vector_student_t.py b/tensorflow/contrib/distributions/python/ops/vector_student_t.py index 7e78ded9df0756..21f84dcbdea8b4 100644 --- a/tensorflow/contrib/distributions/python/ops/vector_student_t.py +++ b/tensorflow/contrib/distributions/python/ops/vector_student_t.py @@ -175,7 +175,7 @@ def __init__(self, if one or more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) graph_parents = [df, loc, scale_identity_multiplier, scale_diag, scale_tril, scale_perturb_factor, scale_perturb_diag] with ops.name_scope(name) as name: diff --git a/tensorflow/contrib/distributions/python/ops/wishart.py b/tensorflow/contrib/distributions/python/ops/wishart.py index 91453fed5d2791..88d4280759da7c 100644 --- a/tensorflow/contrib/distributions/python/ops/wishart.py +++ b/tensorflow/contrib/distributions/python/ops/wishart.py @@ -107,7 +107,7 @@ def __init__(self, ValueError: if df < k, where scale operator event shape is `(k, k)` """ - parameters = locals() + parameters = dict(locals()) self._cholesky_input_output_matrices = cholesky_input_output_matrices with ops.name_scope(name) as name: with ops.name_scope("init", values=[df, scale_operator]): @@ -530,7 +530,7 @@ def __init__(self, more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name, values=[scale]) as name: with ops.name_scope("init", values=[scale]): scale = ops.convert_to_tensor(scale) @@ -646,7 +646,7 @@ def __init__(self, more of the statistic's batch members are undefined. name: Python `str` name prefixed to Ops created by this class. """ - parameters = locals() + parameters = dict(locals()) with ops.name_scope(name) as name: with ops.name_scope("init", values=[scale]): scale = ops.convert_to_tensor(scale) diff --git a/tensorflow/contrib/eager/README.md b/tensorflow/contrib/eager/README.md index 9a3b780af888a5..4384431e7b9c3e 100644 --- a/tensorflow/contrib/eager/README.md +++ b/tensorflow/contrib/eager/README.md @@ -1,6 +1,6 @@ # Eager Execution -Eager execution provides an imperative interface to TensorFlow (similiar to +Eager execution provides an imperative interface to TensorFlow (similar to [NumPy](http://www.numpy.org)). When you enable eager execution, TensorFlow operations execute immediately; you do not execute a pre-constructed graph with [`Session.run()`](https://www.tensorflow.org/api_docs/python/tf/Session). @@ -37,7 +37,7 @@ support for distributed and multi-GPU training and performance. 
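Before the eager README hunk continues below, a note on the recurring `parameters = locals()` to `parameters = dict(locals())` edit in the distribution constructors above: on the CPython interpreters this codebase targets, every `locals()` call inside a function refreshes and returns the same per-frame dict, so a dict captured at the top of `__init__` can later pick up locals bound further down. Copying it freezes a snapshot of just the constructor arguments. A minimal sketch of the difference, independent of TensorFlow:

```python
# Why the constructors copy locals(); this relies on CPython's per-frame
# locals() caching and is not code from the patch itself.
def without_copy(loc, scale):
  parameters = locals()      # the frame's cached locals dict
  extra = object()           # a local bound later in the constructor
  locals()                   # a later locals() call refreshes that same dict
  return parameters          # now also contains 'extra'


def with_copy(loc, scale):
  parameters = dict(locals())  # independent snapshot of the call arguments
  extra = object()
  locals()
  return parameters            # still just loc and scale


print(sorted(without_copy(0., 1.)))  # ['extra', 'loc', 'scale']
print(sorted(with_copy(0., 1.)))     # ['loc', 'scale']
```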
## Installation -Eager execution is included in TensorFlow versions 1.7 and above. +For eager execution, we recommend using TensorFlow version 1.8 or newer. Installation instructions at https://www.tensorflow.org/install/ ## Documentation @@ -48,12 +48,3 @@ For an introduction to eager execution in TensorFlow, see: - Notebook: [Basic Usage](python/examples/notebooks/1_basics.ipynb) - Notebook: [Gradients](python/examples/notebooks/2_gradients.ipynb) - Notebook: [Importing Data](python/examples/notebooks/3_datasets.ipynb) - -## Changelog - -- 2017/10/31: Initial preview release (in TensorFlow 1.5) -- 2017/12/01: Example of dynamic neural network: - [SPINN: Stack-augmented Parser-Interpreter Neural Network](https://arxiv.org/abs/1603.06021). - See [README.md](python/examples/spinn/README.md) for details. -- 2017/03: Core functionality moved out of the experimental tf.contrib namespace - in TensorFlow 1.7. diff --git a/tensorflow/contrib/eager/python/BUILD b/tensorflow/contrib/eager/python/BUILD index 4d28d2266d91b9..dccd813256355d 100644 --- a/tensorflow/contrib/eager/python/BUILD +++ b/tensorflow/contrib/eager/python/BUILD @@ -120,7 +120,6 @@ py_library( visibility = ["//tensorflow:internal"], deps = [ "//tensorflow/python:array_ops", - "//tensorflow/python:checkpointable", "//tensorflow/python:control_flow_ops", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", @@ -131,6 +130,7 @@ py_library( "//tensorflow/python:variable_scope", "//tensorflow/python/eager:context", "//tensorflow/python/eager:function", + "//tensorflow/python/training/checkpointable:base", ], ) diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py index 0783d1b5d70e50..adf92c27ea0a27 100644 --- a/tensorflow/contrib/eager/python/datasets.py +++ b/tensorflow/contrib/eager/python/datasets.py @@ -31,7 +31,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import gen_dataset_ops from tensorflow.python.ops import resource_variable_ops -from tensorflow.python.training import checkpointable +from tensorflow.python.training.checkpointable import base as checkpointable from tensorflow.python.training.saver import BaseSaverBuilder _uid_counter = 0 @@ -106,7 +106,8 @@ def remote_fn(h): target_device=target, buffer_size=10, container="", - shared_name=_generate_shared_name("function_buffer_resource")) + shared_name=_generate_shared_name( + "contrib_eager_iterator_function_buffer_resource")) self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter( # pylint: disable=line-too-long handle=self._buffer_resource_handle, handle_device=self._device) diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py index 7b123707cc3a26..68bec9aee894ed 100644 --- a/tensorflow/contrib/eager/python/datasets_test.py +++ b/tensorflow/contrib/eager/python/datasets_test.py @@ -37,7 +37,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import math_ops from tensorflow.python.ops import script_ops -from tensorflow.python.training import checkpointable_utils +from tensorflow.python.training.checkpointable import util as checkpointable_utils class IteratorTest(test.TestCase): diff --git a/tensorflow/contrib/eager/python/examples/BUILD b/tensorflow/contrib/eager/python/examples/BUILD index c1fd9e0ed020be..1d9371c7ac405d 100644 --- a/tensorflow/contrib/eager/python/examples/BUILD +++ b/tensorflow/contrib/eager/python/examples/BUILD @@ -7,6 +7,8 @@ py_library( name = 
"examples_pip", deps = [ "//tensorflow/contrib/eager/python/examples/gan:mnist", + "//tensorflow/contrib/eager/python/examples/l2hmc", + "//tensorflow/contrib/eager/python/examples/l2hmc:neural_nets", "//tensorflow/contrib/eager/python/examples/linear_regression", "//tensorflow/contrib/eager/python/examples/resnet50", "//tensorflow/contrib/eager/python/examples/rnn_colorbot", diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist.py b/tensorflow/contrib/eager/python/examples/gan/mnist.py index b80c9090235370..cc9cf53410f641 100644 --- a/tensorflow/contrib/eager/python/examples/gan/mnist.py +++ b/tensorflow/contrib/eager/python/examples/gan/mnist.py @@ -227,7 +227,7 @@ def train_one_epoch(generator, discriminator, generator_optimizer, maxval=1., seed=batch_index) - with tfe.GradientTape(persistent=True) as g: + with tf.GradientTape(persistent=True) as g: generated_images = generator(noise) tf.contrib.summary.image( 'generated_images', @@ -306,7 +306,7 @@ def main(_): if __name__ == '__main__': - tfe.enable_eager_execution() + tf.enable_eager_execution() parser = argparse.ArgumentParser() parser.add_argument( diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py index bd35e50c1f434d..81ac05e26d23c2 100644 --- a/tensorflow/contrib/eager/python/examples/gan/mnist_test.py +++ b/tensorflow/contrib/eager/python/examples/gan/mnist_test.py @@ -111,5 +111,5 @@ def benchmark_generate(self): if __name__ == '__main__': - tfe.enable_eager_execution() + tf.enable_eager_execution() tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/BUILD b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD new file mode 100644 index 00000000000000..72341835ff10e3 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/l2hmc/BUILD @@ -0,0 +1,39 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//tensorflow:internal"]) + +load("//tensorflow:tensorflow.bzl", "gpu_py_test") + +py_library( + name = "neural_nets", + srcs = ["neural_nets.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow:tensorflow_py", + "//tensorflow/contrib/eager/python:tfe", + ], +) + +py_library( + name = "l2hmc", + srcs = ["l2hmc.py"], + srcs_version = "PY2AND3", + deps = [ + ":neural_nets", + "//tensorflow:tensorflow_py", + "//tensorflow/contrib/eager/python:tfe", + "//third_party/py/numpy", + ], +) + +gpu_py_test( + name = "l2hmc_test", + size = "large", + srcs = ["l2hmc_test.py"], + additional_deps = [ + ":l2hmc", + "//tensorflow:tensorflow_py", + "//tensorflow/contrib/eager/python:tfe", + "//third_party/py/numpy", + ], +) diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py new file mode 100644 index 00000000000000..98b4ce1b26acf2 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py @@ -0,0 +1,382 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""L2HMC compatible with TensorFlow's eager execution. + +Reference [Generalizing Hamiltonian Monte Carlo with Neural +Networks](https://arxiv.org/pdf/1711.09268.pdf) + +Code adapted from the released TensorFlow graph implementation by original +authors https://github.com/brain-research/l2hmc. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import numpy as np +import numpy.random as npr +import tensorflow as tf +import tensorflow.contrib.eager as tfe +from tensorflow.contrib.eager.python.examples.l2hmc import neural_nets + + +class Dynamics(tf.keras.Model): + """Dynamics engine of naive L2HMC sampler. + + Args: + x_dim: dimensionality of observed data + loglikelihood_fn: log-likelihood function of conditional probability + n_steps: number of leapfrog steps within each transition + eps: initial value learnable scale of step size + """ + + def __init__(self, x_dim, loglikelihood_fn, n_steps=25, eps=.1): + super(Dynamics, self).__init__() + + self.x_dim = x_dim + self.potential = loglikelihood_fn + self.n_steps = n_steps + + self._construct_time() + self._construct_masks() + + self.position_fn = neural_nets.GenericNet(x_dim, factor=2.) + self.momentum_fn = neural_nets.GenericNet(x_dim, factor=1.) + + self.eps = tfe.Variable( + initial_value=eps, name="eps", dtype=tf.float32, trainable=True) + + # TODO(lxuechen): Remove this after model.add_weight is in place + self.vars_not_in_layers = [self.eps] + self.vars_not_in_layers += self.position_fn.vars_not_in_layers + self.vars_not_in_layers += self.momentum_fn.vars_not_in_layers + + def apply_transition(self, position): + """Propose a new state and perform the accept or reject step.""" + + # Simulate dynamics both forward and backward; + # Use sampled Bernoulli masks to compute the actual solutions + position_f, momentum_f, accept_prob_f = self.transition_kernel( + position, forward=True) + position_b, momentum_b, accept_prob_b = self.transition_kernel( + position, forward=False) + + # Decide direction uniformly + forward_mask = tf.cast( + tf.random_uniform(shape=[tf.shape(position)[0]]) > .5, tf.float32) + backward_mask = 1. - forward_mask + + # Obtain proposed states + position_post = ( + forward_mask[:, None] * position_f + + backward_mask[:, None] * position_b) + momentum_post = ( + forward_mask[:, None] * momentum_f + + backward_mask[:, None] * momentum_b) + + # Probability of accepting the proposed states + accept_prob = forward_mask * accept_prob_f + backward_mask * accept_prob_b + + # Accept or reject step + accept_mask = tf.cast( + accept_prob > tf.random_uniform(tf.shape(accept_prob)), tf.float32) + reject_mask = 1. - accept_mask + + # Samples after accept/reject step + position_out = ( + accept_mask[:, None] * position_post + reject_mask[:, None] * position) + + return position_post, momentum_post, accept_prob, position_out + + def transition_kernel(self, position, forward=True): + """Transition kernel of augmented leapfrog integrator.""" + + lf_fn = self._forward_lf if forward else self._backward_lf + + # Resample momentum + momentum = tf.random_normal(tf.shape(position)) + position_post, momentum_post = position, momentum + sumlogdet = 0. 
+ # Apply augmented leapfrog steps + for i in range(self.n_steps): + position_post, momentum_post, logdet = lf_fn(position_post, momentum_post, + i) + sumlogdet += logdet + + accept_prob = self._compute_accept_prob(position, momentum, position_post, + momentum_post, sumlogdet) + + return position_post, momentum_post, accept_prob + + def _forward_lf(self, position, momentum, i): + """One forward augmented leapfrog step. See eq (5-6) in paper.""" + + t = self._get_time(i) + mask, mask_inv = self._get_mask(i) + sumlogdet = 0. + + momentum, logdet = self._update_momentum_forward(position, momentum, t) + sumlogdet += logdet + + position, logdet = self._update_position_forward(position, momentum, t, + mask) + sumlogdet += logdet + + position, logdet = self._update_position_forward(position, momentum, t, + mask_inv) + sumlogdet += logdet + + momentum, logdet = self._update_momentum_forward(position, momentum, t) + sumlogdet += logdet + + return position, momentum, tf.reduce_sum(sumlogdet, axis=1) + + def _backward_lf(self, position, momentum, i): + """One backward augmented leapfrog step. See Appendix A in paper.""" + + # Reversed index/sinusoidal time + t = self._get_time(self.n_steps - i - 1) + mask, mask_inv = self._get_mask(self.n_steps - i - 1) + sumlogdet = 0. + + momentum, logdet = self._update_momentum_backward(position, momentum, t) + sumlogdet += logdet + + position, logdet = self._update_position_backward(position, momentum, t, + mask) + sumlogdet += logdet + + position, logdet = self._update_position_backward(position, momentum, t, + mask_inv) + sumlogdet += logdet + + momentum, logdet = self._update_momentum_backward(position, momentum, t) + sumlogdet += logdet + + return position, momentum, tf.reduce_sum(sumlogdet, axis=1) + + def _update_momentum_forward(self, position, momentum, t): + """Update v in the forward leapfrog step.""" + + grad = self.grad_potential(position) + scale, translation, transformed = self.momentum_fn([position, grad, t]) + scale *= .5 * self.eps + transformed *= self.eps + momentum = ( + momentum * tf.exp(scale) - + .5 * self.eps * (tf.exp(transformed) * grad - translation)) + + return momentum, scale + + def _update_position_forward(self, position, momentum, t, mask): + """Update x in the forward leapfrog step.""" + + mask_inv = 1. - mask + scale, translation, transformed = self.position_fn( + [momentum, mask * position, t]) + scale *= self.eps + transformed *= self.eps + position = ( + mask * position + + mask_inv * (position * tf.exp(scale) + self.eps * + (tf.exp(transformed) * momentum + translation))) + + return position, mask_inv * scale + + def _update_momentum_backward(self, position, momentum, t): + """Update v in the backward leapfrog step. Inverting the forward update.""" + + grad = self.grad_potential(position) + scale, translation, transformed = self.momentum_fn([position, grad, t]) + scale *= -.5 * self.eps + transformed *= self.eps + momentum = ( + tf.exp(scale) * (momentum + .5 * self.eps * + (tf.exp(transformed) * grad - translation))) + + return momentum, scale + + def _update_position_backward(self, position, momentum, t, mask): + """Update x in the backward leapfrog step. Inverting the forward update.""" + + mask_inv = 1. 
- mask + scale, translation, transformed = self.position_fn( + [momentum, mask_inv * position, t]) + scale *= -self.eps + transformed *= self.eps + position = ( + mask_inv * position + mask * tf.exp(scale) * + (position - self.eps * tf.exp(transformed) * momentum + translation)) + + return position, mask * scale + + def _compute_accept_prob(self, position, momentum, position_post, + momentum_post, sumlogdet): + """Compute the prob of accepting the proposed state given old state.""" + + old_hamil = self.hamiltonian(position, momentum) + new_hamil = self.hamiltonian(position_post, momentum_post) + + return tf.exp(tf.minimum(old_hamil - new_hamil + sumlogdet, 0.)) + + def _construct_time(self): + """Convert leapfrog step index into sinusoidal time.""" + + self.ts = [] + for i in range(self.n_steps): + t = tf.constant( + [ + np.cos(2 * np.pi * i / self.n_steps), + np.sin(2 * np.pi * i / self.n_steps) + ], + dtype=tf.float32) + self.ts.append(t[None, :]) + + def _get_time(self, i): + """Get sinusoidal time for i-th augmented leapfrog step.""" + + return self.ts[i] + + def _construct_masks(self): + """Construct different binary masks for different time steps.""" + + self.masks = [] + for _ in range(self.n_steps): + idx = npr.permutation(np.arange(self.x_dim))[:self.x_dim // 2] + mask = np.zeros((self.x_dim,)) + mask[idx] = 1. + mask = tf.constant(mask, dtype=tf.float32) + self.masks.append(mask[None, :]) + + def _get_mask(self, i): + """Get binary masks for i-th augmented leapfrog step.""" + + m = self.masks[i] + return m, 1. - m + + def kinetic(self, v): + """Compute the kinetic energy.""" + + return .5 * tf.reduce_sum(v**2, axis=1) + + def hamiltonian(self, position, momentum): + """Compute the overall Hamiltonian.""" + + return self.potential(position) + self.kinetic(momentum) + + def grad_potential(self, position, check_numerics=True): + """Get gradient of potential function at current location.""" + + if not tf.executing_eagerly(): + # TODO(lxuechen): Change this to tfe.gradients_function when it works + grad = tf.gradients(self.potential(position), position)[0] + else: + grad = tfe.gradients_function(self.potential)(position)[0] + + if check_numerics: + return tf.check_numerics(grad, message="gradient of potential") + + return grad + + +# Defining loss and grads for training +def compute_loss(x, dynamics, scale=.1, eps=1e-4): + """Compute loss defined in equation (8).""" + + z = tf.random_normal(tf.shape(x)) + x_, _, x_accept_prob, x_out = dynamics.apply_transition(x) + z_, _, z_accept_prob, _ = dynamics.apply_transition(z) + + # Add eps for numerical stability; following released impl + x_loss = tf.reduce_sum((x - x_)**2, axis=1) * x_accept_prob + eps + z_loss = tf.reduce_sum((z - z_)**2, axis=1) * z_accept_prob + eps + + loss = tf.reduce_mean( + (1. / x_loss + 1. 
/ z_loss) * scale - (x_loss + z_loss) / scale, axis=0) + + return loss, x_out + + +def loss_and_grads(x, dynamics): + """Obtain loss value and gradients.""" + + with tf.GradientTape() as tape: + loss_val, x_out = compute_loss(x, dynamics) + + vars_ = dynamics.variables + dynamics.vars_not_in_layers + grads = tape.gradient(loss_val, vars_) + + return loss_val, grads, x_out + + +def warmup(dynamics, optimizer, n_iters=1, n_samples=200): + """Warmup optimization to reduce overhead.""" + + samples = tf.random_normal( + shape=[n_samples, dynamics.x_dim], dtype=tf.float32) + + for _ in range(n_iters): + _, grads, samples = loss_and_grads(samples, dynamics) + vars_ = dynamics.variables + dynamics.vars_not_in_layers + optimizer.apply_gradients(zip(grads, vars_)) + + +def fit(dynamics, + optimizer, + n_samples=200, + n_iters=5000, + verbose=True, + logdir=None): + """Fit L2HMC sampler with given log-likelihood function.""" + + if logdir: + summary_writer = tf.contrib.summary.create_file_writer(logdir) + + samples = tf.random_normal( + shape=[n_samples, dynamics.x_dim], dtype=tf.float32) + + tf.train.get_or_create_global_step() + for i in range(n_iters): + loss, grads, samples = loss_and_grads(samples, dynamics) + # TODO(lxuechen): Proper learning rate decay + grads_ = [grad * .96**(i // 1000) for grad in grads] + vars_ = dynamics.variables + dynamics.vars_not_in_layers + optimizer.apply_gradients( + zip(grads_, vars_), global_step=tf.train.get_global_step()) + + if verbose: + print("Iteration %d: loss %.4f" % (i, loss)) + + if logdir: + with summary_writer.as_default(): + with tf.contrib.summary.always_record_summaries(): + tf.contrib.summary.scalar("loss", loss) + + +def get_scg_energy_fn(): + """Get energy function for 2d strongly correlated Gaussian.""" + + # Avoid recreating tf constants on each invocation of gradients + mu = tf.constant([0., 0.]) + sigma = tf.constant([[50.05, -49.95], [-49.95, 50.05]]) + sigma_inv = tf.matrix_inverse(sigma) + + def energy(x): + """Unnormalized log density/energy of 2d strongly correlated Gaussian.""" + + xmmu = x - mu + return .5 * tf.diag_part( + tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu))) + + return energy diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py new file mode 100644 index 00000000000000..522a7c9380131b --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py @@ -0,0 +1,162 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
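Stepping back from the diff for a moment: together with `neural_nets.py` further down, the module above gives a small train-then-sample loop, and the test file that follows constructs it the same way. A hedged end-to-end sketch; the hyperparameter values are illustrative and not taken from the patch:

```python
# End-to-end sketch of the l2hmc pieces defined above; numbers are illustrative.
import tensorflow as tf
from tensorflow.contrib.eager.python.examples.l2hmc import l2hmc

tf.enable_eager_execution()

dynamics = l2hmc.Dynamics(
    x_dim=2,
    loglikelihood_fn=l2hmc.get_scg_energy_fn(),
    n_steps=10,
    eps=.1)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3)

# Train the sampler, then draw from it by repeatedly applying the kernel;
# apply_transition returns (position_post, momentum_post, accept_prob, out).
l2hmc.fit(dynamics, optimizer, n_samples=200, n_iters=100, verbose=False)
samples = tf.random_normal(shape=[200, 2])
for _ in range(50):
  _, _, _, samples = dynamics.apply_transition(samples)
```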
+# ============================================================================== +"""Tests l2hmc fit to 2D strongly correlated Gaussian executed eagerly.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy.random as npr +import tensorflow as tf +import tensorflow.contrib.eager as tfe +from tensorflow.contrib.eager.python.examples.l2hmc import l2hmc + + +def get_default_hparams(): + return tf.contrib.training.HParams( + x_dim=2, + n_samples=200, + n_steps=10, + eps=.1, + n_iters=5, + learning_rate=.001, + n_warmup_iters=1) + + +class L2hmcTest(tf.test.TestCase): + """Unit tests for l2hmc in both eager and graph mode.""" + + def testComputeLoss(self): + """Testing function l2hmc.compute_loss in both graph and eager mode.""" + + # Eager mode testing + hparams = get_default_hparams() + dynamics = l2hmc.Dynamics( + x_dim=hparams.x_dim, + loglikelihood_fn=l2hmc.get_scg_energy_fn(), + n_steps=hparams.n_steps, + eps=hparams.eps) + samples = tf.random_normal(shape=[hparams.n_samples, hparams.x_dim]) + loss, x_out = l2hmc.compute_loss(samples, dynamics) + + # Check shape and numerical stability + self.assertEqual(x_out.shape, samples.shape) + self.assertEqual(loss.shape, []) + self.assertAllClose(loss.numpy(), loss.numpy(), rtol=1e-5) + + # Graph mode testing + with tf.Graph().as_default(): + dynamics = l2hmc.Dynamics( + x_dim=hparams.x_dim, + loglikelihood_fn=l2hmc.get_scg_energy_fn(), + n_steps=hparams.n_steps, + eps=hparams.eps) + x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim]) + loss, x_out = l2hmc.compute_loss(x, dynamics) + samples = npr.normal(size=[hparams.n_samples, hparams.x_dim]) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + loss_np, x_out_np = sess.run([loss, x_out], feed_dict={x: samples}) + + # Check shape and numerical stability + self.assertEqual(x_out_np.shape, samples.shape) + self.assertEqual(loss_np.shape, ()) + self.assertAllClose(loss_np, loss_np, rtol=1e-5) + + +class L2hmcBenchmark(tf.test.Benchmark): + """Eager and graph benchmarks for l2hmc.""" + + def benchmarkEagerL2hmc(self): + """Benchmark Eager performance.""" + + hparams = get_default_hparams() + dynamics = l2hmc.Dynamics( + x_dim=hparams.x_dim, + loglikelihood_fn=l2hmc.get_scg_energy_fn(), + n_steps=hparams.n_steps, + eps=hparams.eps) + # TODO(lxuechen): Add learning rate decay + optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate) + + # Warmup to reduce initialization effect when timing + l2hmc.warmup(dynamics, optimizer, n_iters=hparams.n_warmup_iters) + + # Time + start_time = time.time() + l2hmc.fit( + dynamics, + optimizer, + n_samples=hparams.n_samples, + n_iters=hparams.n_iters) + wall_time = time.time() - start_time + examples_per_sec = hparams.n_samples / wall_time + + self.report_benchmark( + name="eager_train_%s" % ("gpu" if tfe.num_gpus() > 0 else "cpu"), + iters=hparams.n_iters, + extras={"examples_per_sec": examples_per_sec}, + wall_time=wall_time) + + def benchmarkGraphL2hmc(self): + """Benchmark Graph performance.""" + + hparams = get_default_hparams() + with tf.Graph().as_default(): + dynamics = l2hmc.Dynamics( + x_dim=hparams.x_dim, + loglikelihood_fn=l2hmc.get_scg_energy_fn(), + n_steps=hparams.n_steps, + eps=hparams.eps) + x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim]) + loss, x_out = l2hmc.compute_loss(x, dynamics) + + global_step = tf.Variable(0., name="global_step", trainable=False) + learning_rate = 
tf.train.exponential_decay( + hparams.learning_rate, global_step, 1000, 0.96, staircase=True) + optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate) + train_op = optimizer.minimize(loss, global_step=global_step) + + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + + # Warmup to reduce initialization effect when timing + samples = npr.normal(size=[hparams.n_samples, hparams.x_dim]) + for _ in range(hparams.n_warmup_iters): + samples, _, _, _ = sess.run( + [x_out, loss, train_op, learning_rate], feed_dict={x: samples}) + + # Time + start_time = time.time() + for _ in range(hparams.n_iters): + samples, _, _, _ = sess.run( + [x_out, loss, train_op, learning_rate], feed_dict={x: samples}) + wall_time = time.time() - start_time + examples_per_sec = hparams.n_samples / wall_time + + self.report_benchmark( + name="graph_train_%s" % ("gpu" + if tf.test.is_gpu_available() else "cpu"), + iters=hparams.n_iters, + extras={"examples_per_sec": examples_per_sec}, + wall_time=wall_time) + + +if __name__ == "__main__": + tf.enable_eager_execution() + tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/neural_nets.py b/tensorflow/contrib/eager/python/examples/l2hmc/neural_nets.py new file mode 100644 index 00000000000000..c902e1f1f4862d --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/l2hmc/neural_nets.py @@ -0,0 +1,86 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Neural nets utility for L2HMC compatible with TensorFlow's eager execution. + +Reference [Generalizing Hamiltonian Monte Carlo with Neural +Networks](https://arxiv.org/pdf/1711.09268.pdf) + +Code adapted from the released TensorFlow graph implementation by original +authors https://github.com/brain-research/l2hmc. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +import tensorflow.contrib.eager as tfe + + +class GenericNet(tf.keras.Model): + """Generic neural net with different initialization scale based on input. + + Args: + x_dim: dimensionality of observed data + factor: factor of variance scaling initializer + n_hidden: number of hidden units + """ + + def __init__(self, x_dim, factor, n_hidden=10): + super(GenericNet, self).__init__() + + self.v_layer = _custom_dense(n_hidden, 1. / 3.) + self.x_layer = _custom_dense(n_hidden, factor / 3.) + self.t_layer = _custom_dense(n_hidden, 1. / 3.) 
+ self.h_layer = _custom_dense(n_hidden) + + # Scale + self.scale_layer = _custom_dense(x_dim, .001) + self.coeff_scale = tfe.Variable( + initial_value=tf.zeros([1, x_dim]), name='coeff_scale', trainable=True) + # Translation + self.translation_layer = _custom_dense(x_dim, factor=.001) + # Transformation + self.transformation_layer = _custom_dense(x_dim, .001) + self.coeff_transformation = tfe.Variable( + initial_value=tf.zeros([1, x_dim]), + name='coeff_transformation', + trainable=True) + # TODO(lxuechen): Remove this after model.add_weight is in place + self.vars_not_in_layers = [self.coeff_scale, self.coeff_transformation] + + def call(self, inputs): + v, x, t = inputs + h = self.v_layer(v) + self.x_layer(x) + self.t_layer(t) + h = tf.nn.relu(h) + h = self.h_layer(h) + h = tf.nn.relu(h) + scale = tf.nn.tanh(self.scale_layer(h)) * tf.exp(self.coeff_scale) + translation = self.translation_layer(h) + transformation = ( + tf.nn.tanh(self.transformation_layer(h)) * tf.exp( + self.coeff_transformation)) + + return scale, translation, transformation + + +def _custom_dense(units, factor=1.): + """Custom dense layer with specified weight initialization.""" + + return tf.keras.layers.Dense( + units=units, + use_bias=True, + kernel_initializer=tf.contrib.layers.variance_scaling_initializer( + factor=factor * 2., mode='FAN_IN', uniform=False), + bias_initializer=tf.constant_initializer(0., dtype=tf.float32)) diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py index 4e1380afb2e6e7..099b712fc06d1d 100644 --- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py +++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression.py @@ -75,7 +75,6 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None): mse = lambda xs, ys: mean_square_loss(model, xs, ys) loss_and_grads = tfe.implicit_value_and_gradients(mse) - tf.train.get_or_create_global_step() if logdir: # Support for TensorBoard summaries. Once training has started, use: # tensorboard --logdir= @@ -87,12 +86,13 @@ def fit(model, dataset, optimizer, verbose=False, logdir=None): if verbose: print("Iteration %d: loss = %s" % (i, loss.numpy())) - optimizer.apply_gradients(grads, global_step=tf.train.get_global_step()) + optimizer.apply_gradients(grads) if logdir: with summary_writer.as_default(): with tf.contrib.summary.always_record_summaries(): - tf.contrib.summary.scalar("loss", loss) + tf.contrib.summary.scalar("loss", loss, step=i) + tf.contrib.summary.scalar("step", i, step=i) def synthetic_dataset(w, b, noise_level, batch_size, num_batches): @@ -119,7 +119,7 @@ def batch(_): def main(_): - tfe.enable_eager_execution() + tf.enable_eager_execution() # Ground-truth constants. 
true_w = [[-2.0], [4.0], [1.0]] true_b = [0.5] diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py index e53234b51a7dcc..2bc2fc2aa9150a 100644 --- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py +++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_test.py @@ -117,5 +117,5 @@ def benchmarkEagerLinearRegression(self): if __name__ == "__main__": - tfe.enable_eager_execution() + tf.enable_eager_execution() tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb index 459f2f4a7d2afa..51d10a778413cf 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/1_basics.ipynb @@ -7,17 +7,13 @@ "id": "U9i2Dsh-ziXr" }, "source": [ - "# Eager Execution Tutorial: Basics\n", + "# An introduction to TensorFlow\n", "\n", - "This notebook introduces the basics of using TensorFlow's eager execution capabilities. It covers concepts such as:\n", + "This is an introductory tutorial for using TensorFlow. It will cover:\n", "\n", "* Importing required packages\n", - "* Enabling eager execution\n", - "* Creating and using TensorFlow Tensors and Variables\n", - "* Using TensorFlow interactively\n", - "* Using GPUs with eager execution enabled\n", - "\n", - "This notebook does *not* cover modeling topics, such as gradients." + "* Creating and using Tensors\n", + "* Using GPU acceleration\n" ] }, { @@ -27,9 +23,10 @@ "id": "z1JcS5iBXMRO" }, "source": [ - "# Step 1: Import Eager\n", + "## Import TensorFlow\n", "\n", - "The key imports for eager execution are the following:" + "To get started, import the `tensorflow` module and enable eager execution.\n", + "Eager execution enables a more interactive frontend to TensorFlow, the details of which we will discuss much later." ] }, { @@ -48,11 +45,9 @@ }, "outputs": [], "source": [ - "# Import TensorFlow.\n", "import tensorflow as tf\n", "\n", - "# Import TensorFlow eager execution support (subject to future changes).\n", - "import tensorflow.contrib.eager as tfe" + "tf.enable_eager_execution()" ] }, { @@ -62,10 +57,9 @@ "id": "H9UySOPLXdaw" }, "source": [ - "# Step 2: Enable eager execution\n", + "## Tensors\n", "\n", - "All future TensorFlow calls will execute the\n", - "underlying TensorFlow ops immediately:" + "A Tensor is a multi-dimensional array. Similar to NumPy `ndarray` objects, `Tensor` objects have a data type and a shape. Additionally, Tensors can reside in accelerator (like GPU) memory. TensorFlow offers a rich library of operations ([tf.add](https://www.tensorflow.org/api_docs/python/tf/add), [tf.matmul](https://www.tensorflow.org/api_docs/python/tf/matmul), [tf.linalg.inv](https://www.tensorflow.org/api_docs/python/tf/linalg/inv) etc.) that consume and produce Tensors. These operations automatically convert native Python types. 
For example:\n" ] }, { @@ -77,60 +71,47 @@ "autoexec": { "startup": false, "wait_interval": 0 - } + }, + "height": 125 }, "colab_type": "code", - "id": "WPTUfGq6kJ5w" - }, - "outputs": [], - "source": [ - "tfe.enable_eager_execution()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "twBfWd5xyu_d" - }, - "source": [ - "# Step 3: Interactively Use TensorFlow!\n", - "\n", - "Now you can call TensorFlow functions and get results, immediately! No more `tf.Sessions`!\n", - "\n", - "TensorFlow will automatically wrap native Python types for you with operator overloading for TensorFlow Tensors." - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } + "executionInfo": { + "elapsed": 320, + "status": "ok", + "timestamp": 1526420535530, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 }, - "colab_type": "code", - "id": "ngUe237Wt48W" + "id": "ngUe237Wt48W", + "outputId": "b1a1cd60-4eb3-443d-cd6b-68406390784e" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor(3, shape=(), dtype=int32)\n", + "tf.Tensor([4 6], shape=(2,), dtype=int32)\n", + "tf.Tensor(25, shape=(), dtype=int32)\n", + "tf.Tensor(6, shape=(), dtype=int32)\n", + "tf.Tensor(aGVsbG8gd29ybGQ, shape=(), dtype=string)\n", + "tf.Tensor(13, shape=(), dtype=int32)\n" + ] + } + ], "source": [ "print(tf.add(1, 2))\n", "print(tf.add([1, 2], [3, 4]))\n", "print(tf.square(5))\n", "print(tf.reduce_sum([1, 2, 3]))\n", "print(tf.encode_base64(\"hello world\"))\n", - "print(\"\")\n", - "\n", - "x = tf.constant(2)\n", - "y = tf.constant(3)\n", - "print(x * y + 1)\n", "\n", - "# Most TensorFlow ops are directly usable with eager execution, giving\n", - "# results immediately.\n", - "print(tf.contrib.signal.hamming_window(x * y + 1))" + "# Operator overloading is also supported\n", + "print(tf.square(2) + tf.square(3))" ] }, { @@ -140,7 +121,7 @@ "id": "IDY4WsYRhP81" }, "source": [ - "Numpy arrays are supported, too:" + "Each Tensor has a shape and a datatype" ] }, { @@ -151,178 +132,144 @@ "autoexec": { "startup": false, "wait_interval": 0 - } + }, + "height": 53 }, "colab_type": "code", - "id": "lCUWzso6mbqR" + "executionInfo": { + "elapsed": 215, + "status": "ok", + "timestamp": 1526420538162, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "srYWH1MdJNG7", + "outputId": "5e4ac41c-5115-4e50-eba0-42e249c16561" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(1, 2)\n", + "\u003cdtype: 'int32'\u003e\n" + ] + } + ], "source": [ - "import numpy as np\n", - "\n", - "ones = np.ones([3, 3])\n", - "\n", - "print(\"numpy 3x3 matrix of 1s:\")\n", - "print(ones)\n", - "print(\"\")\n", - "\n", - "print(\"Multiplied by 42:\")\n", - "print(tf.multiply(ones, 42))" + "x = tf.matmul([[1]], [[2, 3]])\n", + "print(x.shape)\n", + "print(x.dtype)" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "PBNP8yTRfu_X" + "id": "eBPw8e8vrsom" }, "source": [ - "# Step 4: Define and Print TensorFlow Variables\n", + "The most obvious differences between NumPy arrays and TensorFlow Tensors are:\n", "\n", - "To define TensorFlow variables, use the `get_variable()` function as follows:" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": 
{ - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "3Twf_Rw-gQFM" - }, - "outputs": [], - "source": [ - "x = tf.get_variable(name=\"x\", shape=[], dtype=tf.float32, initializer=tf.zeros_initializer)" + "1. Tensors can be backed by accelerator memory (like GPU, TPU).\n", + "2. Tensors are immutable." ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "45G7094TxsMb" + "id": "Dwi1tdW3JBw6" }, "source": [ - "## Printing TensorFlow Variables" + "### NumPy Compatibility\n", + "\n", + "Conversion between TensorFlow Tensors and NumPy ndarrays is quite simple as:\n", + "* TensorFlow operations automatically convert NumPy ndarrays to Tensors.\n", + "* NumPy operations automatically convert Tensors to NumPy ndarrays.\n", + "\n", + "Tensors can be explicitly converted to NumPy ndarrays by invoking the `.numpy()` method on them.\n", + "These conversions are typically cheap as the array and Tensor share the underlying memory representation if possible. However, sharing the underlying representation isn't always possible since the Tensor may be hosted in GPU memory while NumPy arrays are always backed by host memory, and the conversion will thus involve a copy from GPU to host memory." ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "cellView": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 - } + }, + "height": 251 }, "colab_type": "code", - "id": "UJBJeZ5XxuwA" + "executionInfo": { + "elapsed": 238, + "status": "ok", + "timestamp": 1526420540562, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "lCUWzso6mbqR", + "outputId": "fd0a22bc-8249-49dd-fcbd-63161cc47e46" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "TensorFlow operations convert numpy arrays to Tensors automatically\n", + "tf.Tensor(\n", + "[[ 42. 42. 42.]\n", + " [ 42. 42. 42.]\n", + " [ 42. 42. 42.]], shape=(3, 3), dtype=float64)\n", + "And NumPy operations convert Tensors to numpy arrays automatically\n", + "[[ 43. 43. 43.]\n", + " [ 43. 43. 43.]\n", + " [ 43. 43. 43.]]\n", + "The .numpy() method explicitly converts a Tensor to a numpy array\n", + "[[ 42. 42. 42.]\n", + " [ 42. 42. 42.]\n", + " [ 42. 42. 42.]]\n" + ] + } + ], "source": [ - "# This does NOT print the Variable's actual value:\n", - "print(\"Printing a TensorFlow Variable:\")\n", - "print(x)\n", - "print(\"\")\n", + "import numpy as np\n", "\n", - "# A TensorFlow variable represents a reference to a tensor.\n", - "# The `read_value()` method provides access to the current value of the\n", - "# variable. 
Tensorflow Variables are automatically initialized according to the\n", - "# semantics defined in tf.get_variable().\n", - "print(\"Printing a TensorFlow Variable's value using .read_value():\")\n", - "print(x.read_value())\n", - "print(\"\")\n", + "ndarray = np.ones([3, 3])\n", "\n", - "print(\"Printing a TensorFlow Variable's value using .read_value().numpy():\")\n", - "print(x.read_value().numpy())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "2njjWHcTpBEn" - }, - "source": [ - "## Changing a TensorFlow Variable's value\n", + "print(\"TensorFlow operations convert numpy arrays to Tensors automatically\")\n", + "tensor = tf.multiply(ndarray, 42)\n", + "print(tensor)\n", "\n", - "To change a TensorFlow Variable's value, use its `.assign()` or `.assign_add()` method:" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "v3wr6Erbo_hB" - }, - "outputs": [], - "source": [ - "x.assign(42)\n", - "print(x.read_value())\n", "\n", - "x.assign_add(3)\n", - "print(x.read_value())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "uhtynjHVpTB5" - }, - "source": [ - "## Use a Variable just like any other Tensor" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "7PbktdnHoehR" - }, - "outputs": [], - "source": [ - "print(x + 3)\n", + "print(\"And NumPy operations convert Tensors to numpy arrays automatically\")\n", + "print(np.add(tensor, 1))\n", "\n", - "# This code will broadcast the value across the list of numbers:\n", - "print(x * [1, 2, 4])" + "print(\"The .numpy() method explicitly converts a Tensor to a numpy array\")\n", + "print(tensor.numpy())" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "GVChqwlwy1SI" + "id": "PBNP8yTRfu_X" }, "source": [ - "# Step 5: Debug Errors with Instant Feedback\n", + "## GPU acceleration\n", "\n", - "TensorFlow's eager execution helps you identify and debug runtime issues through interactive exploration of code snippets.\n", - "\n", - "Below, we'll define a length-4 vector, and attempt two `tf.slice()` operations,\n", - "one being legal and the other being illegal, leading to a runtime error that is\n", - "raised immediately." + "Many TensorFlow operations can be accelerated by using the GPU for computation. Without any annotations, TensorFlow automatically decides whether to use the GPU or CPU for an operation (and copies the tensor between CPU and GPU memory if necessary). Tensors produced by an operation are typically backed by the memory of the device on which the operation executed. 
For example:" ] }, { @@ -334,125 +281,68 @@ "autoexec": { "startup": false, "wait_interval": 0 - } + }, + "height": 53 }, "colab_type": "code", - "id": "23ap04N0v4k0" - }, - "outputs": [], - "source": [ - "vector = tf.constant([10.0, 20.0, 30.0, 40.0])" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "FCUMsIYxxRRa" - }, - "outputs": [], - "source": [ - "# Works, because the values of `begin` and `size` (the 2nd and 3rd input\n", - "# arguments) are within the bound of `vector`.\n", - "print(tf.slice(vector, [1], [3]))" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } + "executionInfo": { + "elapsed": 340, + "status": "ok", + "timestamp": 1526420543562, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 }, - "colab_type": "code", - "id": "T8me2oCNxpFp" + "id": "3Twf_Rw-gQFM", + "outputId": "2239ae2b-adf3-4895-b1f3-464cf5361d1b" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Is there a GPU available: False\n", + "Is the Tensor on GPU #0: False\n" + ] + } + ], "source": [ - "# The following does NOT work, because the value of `size` (the 3rd\n", - "# argument) causes the indices to go out of the bounds of `vector`. The\n", - "# error is raised immediately.\n", - "try:\n", - " print(tf.slice(vector, [1], [4]))\n", - "except tf.OpError as e:\n", - " print(\"Caught error: %s\" % e)" + "x = tf.random_uniform([3, 3])\n", + "\n", + "print(\"Is there a GPU available: \"),\n", + "print(tf.test.is_gpu_available())\n", + "\n", + "print(\"Is the Tensor on GPU #0: \"),\n", + "print(x.device.endswith('GPU:0'))" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "irxJhAgar84v" + "id": "vpgYzgVXW2Ud" }, "source": [ - "# Step 6: Using the GPU\n", + "### Device Names\n", "\n", - "You can place Tensors on the GPU by calling a Tensor's `.gpu()` method.\n", - "\n", - "The first operation executing on the GPU may be slow as TensorFlow initializes. Subsequent uses will be much faster." + "The `Tensor.device` property provides a fully qualified string name of the device hosting the contents of the Tensor. This name encodes a bunch of details, such as an identifier of the network address of the host on which this program is executing and the device within that host. This is required for distributed execution of TensorFlow programs, but we'll skip that for now. The string will end with `GPU:\u003cN\u003e` if the tensor is placed on the `N`-th tensor on the host." ] }, { - "cell_type": "code", - "execution_count": 0, + "cell_type": "markdown", "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "7J4N9baqaKCL" + "colab_type": "text", + "id": "ZWZQCimzuqyP" }, - "outputs": [], "source": [ - "# The example code from here on will work only if your notebook\n", - "# is running on a machine with a functional CUDA GPU. 
The following\n", - "# line checks that.\n", - "is_gpu_available = tfe.num_gpus() \u003e 0\n", "\n", - "# Create some Tensors\n", - "SIZE = 1000\n", - "cpu_tensor = tf.random_normal([SIZE, SIZE])\n", "\n", - "if is_gpu_available:\n", - " gpu_tensor = cpu_tensor.gpu()\n", - "else:\n", - " print(\"GPU not available.\")" - ] - }, - { - "cell_type": "code", - "execution_count": 0, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "4E-2n7VbzY1n" - }, - "outputs": [], - "source": [ - "# Time a CPU-based matrix multiplication\n", + "### Explicit Device Placement\n", "\n", - "print(\"Time to conduct matmul on CPU:\")\n", - "%time tf.matmul(cpu_tensor, cpu_tensor)" + "The term \"placement\" in TensorFlow refers to how individual operations are assigned (placed on) a device for execution. As mentioned above, when there is no explicit guidance provided, TensorFlow automatically decides which device to execute an operation, and copies Tensors to that device if needed. However, TensorFlow operations can be explicitly placed on specific devices using the `tf.device` context manager. For example:" ] }, { @@ -463,65 +353,73 @@ "autoexec": { "startup": false, "wait_interval": 0 - } + }, + "height": 53 }, "colab_type": "code", - "id": "vbSFW-T5zhZF" + "executionInfo": { + "elapsed": 1762, + "status": "ok", + "timestamp": 1526420547562, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "RjkNZTuauy-Q", + "outputId": "2e613293-ccac-4db2-b793-8ceb5b5adcfd" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "On CPU:\n", + "10 loops, best of 3: 35.8 ms per loop\n" + ] + } + ], "source": [ - "# Time GPU-based matrix multiplications.\n", + "def time_matmul(x):\n", + " %timeit tf.matmul(x, x)\n", "\n", - "if is_gpu_available:\n", - " # First use of the GPU will be slow:\n", - " print(\"Time to conduct first matmul on GPU:\")\n", - " %time tf.matmul(gpu_tensor, gpu_tensor)\n", - " print()\n", + "# Force execution on CPU\n", + "print(\"On CPU:\")\n", + "with tf.device(\"CPU:0\"):\n", + " x = tf.random_uniform([1000, 1000])\n", + " assert x.device.endswith(\"CPU:0\")\n", + " time_matmul(x)\n", "\n", - " # Subsequent uses are much faster:\n", - " print(\"Time to conduct second matmul on GPU:\")\n", - " %time tf.matmul(gpu_tensor, gpu_tensor)" + "# Force execution on GPU #0 if available\n", + "if tf.test.is_gpu_available():\n", + " with tf.device(\"GPU:0\"): # Or GPU:1 for the 2nd GPU, GPU:2 for the 3rd etc.\n", + " x = tf.random_uniform([1000, 1000])\n", + " assert x.device.endswith(\"GPU:0\")\n", + " time_matmul(x)" ] }, { - "cell_type": "code", - "execution_count": 0, + "cell_type": "markdown", "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - } - }, - "colab_type": "code", - "id": "E5pIOe3Rz7iW" + "colab_type": "text", + "id": "YEOJTNiOvnpQ" }, - "outputs": [], "source": [ - "# Second timing demo for GPUs, after it has been used once:\n", - "\n", - "cpu_tensor = tf.random_normal([SIZE, SIZE])\n", - "print(\"Time to conduct CPU matmul:\")\n", - "%time tf.matmul(cpu_tensor, cpu_tensor)\n", - "print()\n", + "## Next Steps\n", "\n", - "if is_gpu_available:\n", - " gpu_tensor = cpu_tensor.gpu()\n", - " print(\"Time to conduct GPU matmul:\")\n", - " %time tf.matmul(gpu_tensor, gpu_tensor)" + "In this tutorial we covered the most fundamental concepts in TensorFlow - `Tensor`s, operations, and devices.\n", + "In 
[the next tutorial](https://github.com/tensorflow/models/tree/master/official/contrib/eager/python/examples/notebooks/2_gradients.ipynb) we will cover automatic differentiation - a building block required for training many machine learning models like neural networks." ] } ], "metadata": { "colab": { + "collapsed_sections": [], "default_view": {}, - "name": "Eager Execution Tutorial: Basics", - "provenance": [ - { - "file_id": "0B0kLcpwLFwKEVm9XNkFueGk4bTg", - "timestamp": 1504118841551 - } - ], + "name": "TensorFlow: An introduction", + "provenance": [], "version": "0.3.2", "views": {} } diff --git a/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb index e6c7c117333e1e..9c1af9c2084bac 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/2_gradients.ipynb @@ -7,12 +7,9 @@ "id": "vDJ4XzMqodTy" }, "source": [ - "# Eager Execution: Working with Gradients\n", + "# Automatic Differentiation\n", "\n", - "This notebook demonstrates:\n", - "\n", - "* How to get gradients using TensorFlow's eager execution capabilities\n", - "* How to apply the gradients so you can update your variables" + "In the previous tutorial we introduced `Tensor`s and operations on them. In this tutorial we will cover [automatic differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation), a key technique for optimizing machine learning models." ] }, { @@ -22,7 +19,7 @@ "id": "GQJysDM__Qb0" }, "source": [ - "# Setup: Import eager and enable eager execution.\n" + "## Setup\n" ] }, { @@ -40,14 +37,10 @@ }, "outputs": [], "source": [ - "# Import TensorFlow.\n", "import tensorflow as tf\n", + "tf.enable_eager_execution()\n", "\n", - "# Import TensorFlow eager execution support (subject to future changes).\n", - "import tensorflow.contrib.eager as tfe\n", - "\n", - "# Enable eager execution.\n", - "tfe.enable_eager_execution()" + "tfe = tf.contrib.eager # Shorthand for some symbols" ] }, { @@ -57,28 +50,15 @@ "id": "1CLWJl0QliB0" }, "source": [ - "# Fitting a Simple Linear Model" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "-39gouo7mtgu" - }, - "source": [ - "## Step 1: Synthesize some data\n", + "## Derivatives of a function\n", "\n", - "To demonstrate fitting a model with TensorFlow's eager execution, we'll fit a linear model to some synthesized data (which includes some noise).\n", - "\n", - "In the code, we use the variable names `w` and `b` to represent the single weight and bias we'll use to fit our model." + "TensorFlow provides APIs for automatic differentiation - computing the derivative of a function. The way that more closely mimics the math is to encapsulate the computation in a Python function, say `f`, and use `tfe.gradients_function` to create a function that computes the derivatives of `f` with respect to its arguments. If you're familiar with [autograd](https://github.com/HIPS/autograd) for differentiating numpy functions, this will be familiar. 
For example: " ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "cellView": "code", "colab": { "autoexec": { "startup": false, @@ -86,170 +66,115 @@ } }, "colab_type": "code", - "id": "rQsdCg9PfIL-" + "id": "9FViq92UX7P8" }, "outputs": [], "source": [ - "# The constants we'll try to fit our variables to:\n", - "true_w = 3\n", - "true_b = 2\n", + "from math import pi\n", "\n", - "NUM_EXAMPLES = 1000\n", + "def f(x):\n", + " return tf.square(tf.sin(x))\n", "\n", - "# Our inputs:\n", - "inputs = tf.random_normal(shape=[NUM_EXAMPLES, 1])\n", + "assert f(pi/2).numpy() == 1.0\n", "\n", - "# Our labels, with noise:\n", - "noise = tf.random_normal(shape=[NUM_EXAMPLES, 1])\n", - "labels = inputs * true_w + true_b + noise" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 360, - "output_extras": [ - { - "item_id": 1 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 127, - "status": "ok", - "timestamp": 1505502830690, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "O4lsC4ckAcar", - "outputId": "2f760690-cafb-4777-b970-91d839f99faf" - }, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAesAAAFXCAYAAACC+2avAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXt8VPWd99+TK7kykxtJQIebqZfaqogtrhKNa1ooEKl9\nCrpVn9ZNW6x9VWsbCi7aVUt01NZ9tq21KVZlFey2YkQNohhj3QWK2liCF5RIBCc3yEwmIZnMTOY8\nf/zmzJwzSSBAYibh+369eIU5c87vXLh8zvdu0TRNQxAEQRCEmCVurC9AEARBEISjI2ItCIIgCDGO\niLUgCIIgxDgi1oIgCIIQ44hYC4IgCEKMI2ItCIIgCDHOiIj16tWrufjii1m8eHF4269//Wvmz5/P\n0qVLWbp0Ka+//vpInEoQBEEQTjksI1Fn/eabb5KWlkZFRQWbN28GlFinpaXx7W9/+6QvUhAEQRBO\nZUbEsr7wwgvJzMwcsF36rQiCIAjCyTOqMesnn3ySsrIybr/9drq6ukbzVIIgCIIwYRk1sb722mt5\n5ZVXqK6uJicnh8rKytE6lSAIgiBMaEZNrLOysrBYLAB885vfZPfu3cc8RtzmgiAIgjCQhJFaKFpo\n29vbyc3NBeDll1+mqKjomGtYLBba2yeuuzw3N0Pubxwzke9vIt8byP2Nd06F+zsWIyLWt912Gzt3\n7sTtdnPZZZfxwx/+kJ07d/Lee+8RFxfH1KlTueuuu0biVIIgCIJwyjEiYv3ggw8O2Hb11VePxNKC\nIAiCcMojHcwEQRAEIcYRsRYEQRCEGEfEWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYRsRYEQRCEGEfE\nWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYRsRYEQRCEGEfEWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYR\nsRYEQRCEGEfEWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYRsRYEQRCEGCdhrC9AEARBOHXo6HCzcmUt\nTU2Z2O2dOBwl2GzWsb6smEfEWhAEQfjMWLmylurq6wAL9fUasJ6qqqVjfVkxj7jBBUEQhM+MpqZM\nwBL6ZAl9Fo6FiLUgCILwmWG3dwJa6JOG3e4Zy8sZN4gbXBAEQfjMcDhKgPWhmLUHh+Pysb6kcYGI\ntSAIgvCZYbNZJUZ9AogbXBAEQRBiHBFrQRAEQYhxRKwFQRAEIcYRsRYEQRCEGEfEWhAEQRBiHBFr\nQRAEQYhxRKwFQRAEIcYRsRYEQRCEGEfEWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYRsRYEQRCEGEfE\nWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYRsRYEQRCEGEfEWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYR\nsRYEQRCEGEfEWhAEQRBiHBFrQRAEQYhxRKwFQRAEIcYRsRYEQRCEGEfEWhAEQRBiHBFrQRAEQYhx\nRKwFQRAEIcYRsRYEQRCEGCdhrC9AEARBODE6OtysXFmL02mjsLADh6MEm806rGOamjKx2zuHdYww\n9oyIWK9evZrXXnuN7OxsNm/eDEBnZye33norn376KdOmTeOhhx4iIyNjJE4nCIIgACtX1lJdfR1g\nATRgPVVVS037RIuzz9dDTc33AQv19YMfI8QeI+IG//rXv866detM237/+98zb948XnrpJb70pS/x\nyCOPjMSpBEEQhBBNTZkooQawhD6b0QW9vv4qqquvZ/v27mMeI8QeIyLWF154IZmZ5j/wbdu2sXSp\neltbunQpr7zyykicShAEQQhht3eiLGoADbvdM2CfaEGH7GMeI8Qeoxaz7ujoICcnB4Dc3FxcLtdo\nnUoQBOGUxOEoAdaHYtYuHI7LAbPru61tD1AM2AAXkyY5sVr/CBxi3rwMHI5FY3cDwrCJuQSz3NyJ\nHdeW+xvfTOT7m8j3BhPz/uLi+klOTgQgOTmBnJwMsrIyuPnm5w2x7DKmTbuPgoJzaG7ew8GDt6PH\nuDMyNpKdncFNNz3Pxx+nM2NGFw8/vJCsrNhLOJuIf37Hw6iJdXZ2NocOHSInJ4f29naysrKGdVx7\ne9doXdKYk5ubIfc3jpnI9zeR7w0m7v2Vlz8XFuVduzT6+lSy2N69KRhd3zk5Z/LCC5dRWtrPwYOR\n7Xv3pnDjjYOvEUtM1D8/neG8iIxYnbWmaabPJSUlPPPMMwBs2rSJK66
BYRETE5hbWIiIjJKaxFRERMTmEtIiJicgprERERk1NYi4iImJzC\nWkRExOQU1iIiIiansBYRETE5hbWIiIjJKaxFRERMTmEtIiJicoMK68OHD7N48WJmzpzJBx984PK1\nbdu2kZGRwcKFC/nzn/88qEaKiIgMZ4MK67S0NLZs2cLs2bNdPl9eXk5RURGHDh1i+/btPPvss9hs\ntkE1VEREZLgaVFhPnTqVKVOmeARxcXExmZmZREZGMnnyZFJSUjh9+vSgGioiIjJcBWTOuqamhqSk\npO6PJ06cSE1NTSAeJSIiEvIi+7pg5cqVXL582ePzOTk5LFiwwOv3eBvyDgsLG0DzREREpM+w/vWv\nf93vm06aNImqqqruj6urq5kwYYJP35uYGN/v5w0ler+hLZTfL5TfDfR+Q12ov19f/DYM7tybXrBg\nAYcOHaK9vZ1PPvmECxcu8LnPfc5fjxIRERlWwmyDWKb99ttvs2nTJhobGxk9ejQzZszglVdeAYyt\nW/v27SMyMpKnn36au+66y2+NFhERGU4GFdYiIiISeKpgJiIiYnIKaxEREZNTWIuIiJicacN6x44d\nzJgxA6vVGuym+NXPf/5z7rvvPpYuXcrjjz9OXV1dsJvkV3l5eSxcuJCsrCzWrFlDS0tLsJvkN73V\nwh/Kjh8/zr333ss999zDyy+/HOzm+NXGjRuZO3cuS5YsCXZTAqK6upoVK1aQmZnJkiVL2LlzZ7Cb\n5Dft7e089NBDLF26lCVLlrBly5ZgNykgurq6uP/++1m1alWv15kyrKurq3n33XdJTk4OdlP87tvf\n/jb79++noKCA+fPnh9x/gHfddRcHDx6ksLCQlJQUtm3bFuwm+U1PtfCHsq6uLjZt2sSOHTv4wx/+\nwMGDBykvLw92s/zmgQceYMeOHcFuRsBERESwYcMGDh06xJ49e9i9e3fI/Pyio6PZuXMnBQUFFBQU\ncPz48ZAsW71z506mTZvW53WmDOvc3FzWrl0b7GYERGxsbPefW1tbCQ835Y9gwObOndv9TrNmzaK6\nujrILfKfnmrhD2WnT58mJSWFm266iaioKBYtWkRxcXGwm+U3X/ziFxk9enSwmxEwiYmJzJw5EzD+\n3zJt2jRqa2uD3Cr/GTlyJGD0sjs7O4PcGv+rrq6mpKSEhx56qM9r+6xgdqMdOXKEpKQkbrnllmA3\nJWBefPFFCgsLiY+PD6lhK3f79u1j0aJFwW6G9MJbHf/3338/iC2Sgbp48SJnzpwJqQJUXV1dPPDA\nA1y4cIFHHnkkpN4NHB3T5ubmPq8NSlj3VG/8qaeeYtu2bbz66qvdnxuKvZi+6qnn5OSQk5PDyy+/\nzG9/+1vWrFkThFYOnC/14rdu3UpUVNSQmyscSC38oWwo/v0ST1euXOHJJ59k48aNLqN3Q114eDgF\nBQW0tLSwevVqzp07x/Tp04PdLL84duwY48ePZ+bMmZw8ebLP64MS1j3VGz979iyXLl0iKysLm81G\nTU0Ny5Yt4/e//z3jxo27wa0cOF/rqS9evJgnnnhiyIV1X++Xn59PSUnJkBw1GEgt/KFs0qRJVFZW\ndn9cU1Pjcx1/MYfOzk6efPJJsrKy+MpXvhLs5gREXFwcX/rSl/jTn/4UMmH93nvvceTIEUpKSmhr\na+PKlSusXbuWvLw8r9ebasI0LS2Nd955h+LiYo4cOcLEiRPJz88fUkHdl4qKiu4/FxcXM3Xq1CC2\nxv+OHz/OK6+8wtatW4mOjg52cwImVHqkt99+OxcuXODSpUu0t7dz8OBB7r777mA3y69C5WfVk40b\nNzJ9+nS++c1vBrspftXQ0NA9PHz16lVOnDgRUv+//N73vsexY8coLi7mZz/7GXfeeWePQQ0mnLN2\nFhYWFnJ/0V544QXOnz9PeHg4ycnJPPvss8Fukl/9+Mc/pqOjg8ceewyAz3/+8/zwhz8MbqP8xLkW\n/qpVq1xq4Q9VERERPPPMMzz22GPYbDYefPBBn1amDhXf//73OXnyJFarlfnz57NmzRqWLVsW7Gb5\nzV//+lcOHDhAWloaS5cuJSwsjJycHObNmxfspg1aXV0d69evp6uri66uLjIzM0lPTw92s4JGtcFF\nRERMzlTD4CIiIuJJYS0iImJyCmsRERGTU1iLiIiYnMJaRETE5BTWIiIiJqewFhERMTmFtYiIiMn9\nPyQ+uNKCpR6MAAAAAElFTkSuQmCC\n", - "text/plain": [ - "\u003cmatplotlib.figure.Figure at 0xa813090\u003e" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "# Plot the Data (Optional)\n", "\n", - "import matplotlib.pyplot as plt\n", - "\n", - "plt.scatter(inputs.numpy(), labels.numpy())\n", - "plt.show()" + "# grad_f will return a list of derivatives of f\n", + "# with respect to its arguments. Since f() has a single argument,\n", + "# grad_f will return a list with a single element.\n", + "grad_f = tfe.gradients_function(f)\n", + "assert tf.abs(grad_f(pi/2)[0]).numpy() \u003c 1e-7" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "JaFHyAG9nDET" + "id": "v9fPs8RyopCf" }, "source": [ - "## Step 2: Define our TensorFlow variables\n", + "### Higher-order gradients\n", "\n", - "We'll use Keras's object-oriented [`Dense`](https://www.tensorflow.org/api_docs/python/tf/contrib/keras/layers/Dense) layer to create our variables. In this case, we'll create a `Dense` layer with a single weight and bias.\n", - "\n", - "(**Note**: We're using the implementation of `Dense` found in `tf.layers.Dense` though the documentation link is for `tf.contrib.keras.layers.Dense`. 
When TensorFlow 1.4 is released, the documentation will also be in `tf.layers.Dense`) " + "The same API can be used to differentiate as many times as you like:\n" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 0, "metadata": { - "cellView": "code", "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, - "height": 34, - "output_extras": [ - { - "item_id": 1 - } - ] + "height": 276 }, "colab_type": "code", "executionInfo": { - "elapsed": 22, + "elapsed": 730, "status": "ok", - "timestamp": 1505502830753, + "timestamp": 1527005655565, "user": { "displayName": "", "photoUrl": "", "userId": "" }, - "user_tz": 240 + "user_tz": 420 }, - "id": "z9r-ZeyrXu3A", - "outputId": "6230a7a3-29fe-4d08-f101-da80425bad82" + "id": "3D0ZvnGYo0rW", + "outputId": "e23f8cc6-6813-4944-f20f-825b8a03c2ff" }, "outputs": [ { "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXYAAAEDCAYAAAAhsS8XAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzsnXd0HNX5sJ/ZXrTq3ZLV3IvcDdgGGwOm2WCbHhJa6C2B\nUBISQioBfoQPkjhACA4QCIQSDITQbGMbsHHvVbZ6s7q0vc18f4xmJVltJa0q+5zDOXhn9s7dqzvv\nfe/briBJkkSYMGHChBkxqAa7A2HChAkTJrSEBXuYMGHCjDDCgj1MmDBhRhhhwR4mTJgwI4ywYA8T\nJkyYEUZYsIcJEybMCCNkgl0URVasWMHtt98eqibDhAkTJkwvCJlgf+2118jJyQlVc2HChAkTppeE\nRLBXVlayceNGrrjiilA0FyZMmDBh+kBIBPvjjz/OQw89hCAIoWguTJgwYcL0gT4L9g0bNhAfH8/E\niRMJVycIEyZMmMFH6GutmGeeeYYPP/wQtVqN2+3Gbrdz3nnn8dRTT3X6HUmSwtp9CKittvH8UxsQ\nxZY/4aXXTGfa7PRB7NXAU1dj5y9PrIfmYUgeFcnya2aQmBI5uB0bYE5WNPHS/9uE6JcHYukVucw8\nPWOQezXw7NhcyCfvH0Bqfi+uumkO4ycnD3KvBpY+C/bWbNu2jdWrV/PCCy90e291tTVUj+03EhIs\nQ7qfWzfls2tzMTNPH01UrJEv/3eU5LRIVnx/5mB3rUP6azw3fnaMQ7vLOX1RNrVVNvIOVZGeFcPS\nq6YNmT6GmlP7KYoi/3ltF9WVNhacO4btXxfi9fi5+Mpc0jJjhkw/+5t9O0r5Zu1xDEYtpy/KZuOn\nR4mOM3HlTbNRqTo3UAynv3swhOPYhymSJJF3sAqtTs35l05mQm4K6VkxVJY2UVdtH+zuDRgOu4ej\n+yqIjDYwbW4a514yiYTkCMqKGnC7vIPdvQFjz9YSqittjJuSxNTZaVywcgoAX3xwCL9PHOTeDRyH\ndpej0ai47PqZTJyWwoTcFOprHBzdf3KwuzaghFSwz507NyhtPUzfOVnehLXRRdbYeLQ6DQATp6UC\ncGhv+WB2bUA5sLMMv19i2pz0gEaWNS4BUZQoOlE3yL0bGDxuHzu+LsRk1jH/nDEApI6OZtL0VFxO\nLyfLmwa5hwNDU4OT+loHozJiiIw2AjB7QSYajYrtXxXg9foHuYcDR1hjH6bkHawCYOzkxMBnmWPj\nMJq1HDtwEt93YBJ7PT4O7CrDYNQwPrfFhpo1Lh6AgmPVg9W1AaWyrBG/X2JCbjIGozbweXqWbIIp\nLawfrK4NKMX58kI+Oic28FmERc/UOWnYbR7yDn53tPawYB+GiKLI8SNVGEzaNvZTtVrFhKkpuF0+\n8o+OfKGWd7gKt8vHlJmj0GrVgc9j4kxExRopzq/7Tixw5cWNAKSkR7f5PHV0NIIApUXfEcHevEMb\nnR3b5vPxU5IAqChpHPA+DRZhwT4MKS2sx+XwMmZCYjuH0MRpKQAc3lsxGF0bUJQXNWdiYpvPBUEg\ne1w8Pq9IyXdAW60oaUAQ5Gig1uj0GhJTIqkqb8Lj9g1S7wYGn89PWXE90XGmgBlGITrWhN6gobIs\nLNjDDGE6MsMoRMUYSUiOoLKsacQ7zaoqrGh1amLiTO2uZY1LAKDgWM1Ad2tA8Xr9VFVYSUi2oNNr\n2l1Py4xBkqC8pGEQejdwVJQ04vOKZJyirYO80CeNiqSpwYXD5h6E3g08YcE+zJAkiZKCOswWHUmp\nHcdpJyRbEEWJupqRGx3jdvloqHWQmGLpMCciMcWCOUJH0fEaRHHkLnBV5U2IokRKelSH10dlyOaZ\nkW5nD5hhcuI6vJ48Sh6fyrLvhiM5LNiHGQ67B6fDS2JyZKdJXgnJcqxrdeXQj8vtLcpv6ywJSRAE\nMsfF43L6RvTLXF4sa+Kn2tcVkkdFodGoKCvqu8b+zjtv8f3vX8Fvf/ton9sKNUX5tWi0KlLSOl7g\nFDPVSJ4LrWm/dwszpKk5aQMgLimi03u+C4JdCeFLTOk8YSMlLYqDu8qpOWkjtRPBN9wpb/YzpHai\nsas1KlLSoygpqMdhc2OK0Pf6WWvWvMsf//hnkpNTet1Gf9BY76Sxzknm2DjUmo51VXlnBye/I3b2\nsMY+zKitkgV7fGLngj02wYxKLVBdaRuobg04VRWyYO/MHAUQlyCPkTJmIw2/X+RkeRNxCWb0Bm2n\n943KaA577IPW/vTTf6C8vIyHH76ft99+s9ft9AeKUzQto/MMW61OQ1xiBFWV1hHve4Kwxj7sUDT2\n+C40drVaRVyCmdpqG36/iFo9stZvSZKoKrditugwWzrXQKNijahUwojLxH17/XF25VXj8fhx+Hzo\nGh1s/+vmTu8X/SJ2RA59egTDxhMd3jNnQiJXLh7TaRsPPPAztm79lj//+UUiI4dWDZ76Zl9SXBfK\nDshmqZqTNqpPWgM295HKyHrjvwPUVNnQGzRERHa9pU5ItiD6pREn1ADsVjcOu6fbIl9qtYqYeBN1\nNfYRWXlU0Ty7W7hVzddFf181VYlApbUhhDLHYxPMXd6XnCbPl5PfATt7WGMfRng9PhrrnM2JJ11X\nx5Tt7BVUn7QGbO4jhZPliuO0+98VlxBB
bZWdpgYnUTHtwyKHI1cuHsNdV83g1b9upuhELdf/cG63\ntvN/v7ydpgYnN99xxoirrFpX48Bk1rXJuu2IlsiYRqYxsiughjX2YURts2bSlRlGocWBOvLsy4p9\nPZiyvLGJshZXWzXydi71tXYMJm1QDtHYeBM+r4itaWTFcXs9PqyNLmLiu1+0IyL1mCN0VJY2jcgd\nXGvCgn0YEbCvd2NLBIiNN6NSCdSMwMiYqoqeaeww8hyoPq9fFmixwe1CouPkBa6+ti8L3NDT9Otq\nHED3ZhiQQ2ATUyNx2D3YbZ7+7tqgEhbsw4hgHKcKao2K2AQztVWyA3WkIIoS1ZVWYuJNHWZankpc\n8wtfO8J8DbLfAKI7yLrtiNhmjba+WRD2hnfe+YDIyKHldAzY1+O7F+xAIEu5sa734zAcCAv2YURt\nlQ2VWgj6ZU5ItuD3S4GogZFAU4MTr8dPQlJwfgNThA6DUTPinMi11fIi31E5hY5Q5kx97cgSaMrc\nDkZjB7luDEBDnbPf+jQUCAv2YYIoitRW24mNNwcdvjgS7eyN9fILGR1r7OZOGUEQiE2IoLFeXhBG\nCjXNpqXoYE0xMSYEoa+mmKGHUjYjJi44wR7VPG/CGnuYIUFDnRO/TwzKDKOg3DuS7MuNzZpWVJAC\nDVrMMSOpdk5AsAepsas1KiJjjNTXOEaU47Cuxo7ZokdvCC7AL6yxhxlS9MRxqqBotU0NI2cSN9bL\nmlZUTHAaO7Qkrijmi5GAYpazRBmC/k5snBm3y4fTMTKODHS7vNitnqDNMAAGoxaDUUNDWGMPMxRQ\nbMTdZde1Rm/QojdoaGxw9Ve3BhzFFNMTwa68+HUjJORRkiRqquxExciZtcESHXCgjoxxCETEBBHq\n2JqoWBNNDc4RFVRwKn0W7B6PhyuuuILly5ezbNky/vKXv4SiX2FOQdG6eyLQlPubGpyI4sjYfjfW\nOzGatEFFxCgoERMjJTLGYfPgcfuCdpwqxI4wB2rAcRpkRIxCdIwRSQJr48hReE6lz4Jdp9Px2muv\nsWbNGtasWcOmTZvYt29fKPoWphVNDU7UGhWmCF2PvhcZbUT0S9itwz8xxe8XsTa6Ag6wYNHq1ETF\nGKkbIaYYRTAHa19XiGkWgL0NeWxdtvebb77ijTdeDfq7lZUVfPHFp0Hd+/jjv2bjxvXd3te6lMCa\nNe/x2Wf/C6r9qICdXR6HTz75L9XVLUdJPvnk7ykqKgyqraFKSEoKGI3yi+bxePD5RvYRXINFU4OL\nyChDj9PBFQ2/sd7ZI3vsUMTa6EKS6FVpgMgYIyX5TjxuX4+0/aGIIpCCTU5SUByHvY2MObVs7/z5\nZ7a7x+/3o1ar231eXl7GF198xnnnXdCrZ3eE4gyPjDawfPllQX9PGQfFEf+//33EzJlTSUrKAODh\nh38esj4OFiGZ4aIosnLlSoqLi7n22mvJzc0NRbNhmnG7vLhdvnZnWgZDZLQszGVTTudlTYcDgYiY\nHpqjACKjlHFw9SiyaCjS0EuNXatTY4nU98oU07ps78UXX4LFYuHIkUPcd99DPP74r7FYIsnLO8r4\n8ROZP/9MnnvuaQRBQKvV8OyzL/Dii6soKirkppuu5YILlnLllde0af+ZZ55k9+6dpKSktonaOXr0\nCH/+8zO4XC6ioqL5+c8fIzY2jnvuuQ3RFUt1XSGxH5Zht9sxmUycccYCfve7x3jpJXk3UVlZwcMP\n38+rr77JK6/8nW+++QqHw4lWSmTS9HvYsGEdR44c5sEHH0Sj0fL886t54IF7ufvu+zh8+ADl5eXc\neee9gKzZHz16hB//+AE+//wT3nnnLfx+H5MmTeEnP/npkKrBExLBrlKpWLNmDTabjTvvvJPjx48z\nZkznJUDD9IymZufnqYf0BoMiBEdCZExDLyJiFJQFztroHPaC/Vv315ROK+RP+ZsRCnomTJxjPfh8\nInnfbGgjiGYkTmXlmKWdfu/Usr2ffPLfNt8vLS3mT396AYCHH76Pn/zkp0yZkktEhIamJg+33343\nb731Ok8++f/atb1x45eUlpbwz3++TU1NDd///hUsXXopPp+PZ599iieeeIaoqGjWrfuCF19cxc9+\n9kskScLpsHPdlT9l6VXTWL36bwBkZGTi9/uoqCgnJSWVdes+55xzzgPgssuu4oYbbsbn9XPj9+9k\n566t3P/Idbz33tv88pe/ICGhbWGwRYvO5fbbbwwI9nXrPuf6639IUVEh69Z9zgsvrEatVvPHPz7J\n559/wvnnX9Sjv0V/EtI9aUREBHPnzuWrr77qVrAnJAyPioNDoZ/VzdUMU9OiO+1PZ58b9HLFO5fD\nNyR+S1/64HHKCUaZ2fE9bidttLxb8fukbr87FMapK9xuH6oIAU0v6uyrNSp8PhEkUKtbBLPJqOv2\nd6tUEBdnJjragsViwNj8HYNBy8KFSwPfP/30uTz//HMsW7aMJUuWkJSURHS0CZ1O0+Ezjh07wIoV\nl5KQYCEhwcK8eWcQGWnEZquhoCCfBx+8F0mSEEWRxMREEhIsCAhkpE4nMTmShAQLZrMes9lAQoKF\npUsvZuvWTdxyyy1s2rSeZ599loQEC7t2bebll1/G6XRSVV9FWXkaCQkWtFo1ktQyL7RaNTExJsaO\nTSczM4OKigJGjx5NeXkpixcv4I033uD48WPccceNSJKE2+0mLS15SM2bPgv2uro6tFotFosFl8vF\nli1buPXWW7v9XnX10C9OlZBgGRL9LC2WDyJWaYQO+9NVPyVJQqNVUV1pHfTf0tfxPFkhn5QjIva4\nHalZhlWUNnb53aHyN+8Mr9dPbN5YZo45gwvPn9Lj7x/aU87GT49x9sUTmDA1uc217n63KErU1trw\netVYrS6cTg/V1VZcLi8+X8vcXLHiGqZNm8uWLV9z5ZVX8swzq2hocODx+Dp8htPpwWZzB6653V6a\nmpzU1dnIysrm+edXt+uny+VFY9Gh0amorrZit7uRJDXV1VZOO+0sHn30p8yaNQ+/X8JojKGsrJZf\n/erXrF79OvHxCTx8/2+wNjgpL6vH6/W3+f1er5/6egfV1VYWLDibd99dQ0ZGJvPnL6S62orV6mTJ\nkou47ba7ejR+oSDYxaPPUTHV1dVcd911XHrppVxxxRUsWLCAhQsX9rXZMK1QzCi9McUIgkBktJHG\nBuewzzhsqHNiMut65fxsbYoZziip8PGJPQvxU1Ac6LZ+DPUrKyslOzuHa6+9nilTplBcXIjJZMZu\n79hpO23aTNau/RxRFKmpqWHXrp0AjB6dSX19AwcO7AfA5/NRUJAPtBwy0tE7MWpUGmq1ilde+TuL\nF8tmGI/HgyBAZGQUDoeD44W7geY5ZTJhs3UcMbVw4WK++mpDG5POrFlz2bBhHfX1ssLV1NREZWVl\nr8aqv+izxj5+/Hjef//9UPQlTCcoSTmW6N5FtcihfnacDi8mc8/CJYcKfr+IrcnV6yPN9AY59r1p\
nmMcuK6nwSjninqIIdmtTb8YhOHv+O++8ya5dO1Cr1YwfP47TT58PgFqt4cYbv8eFFy5r4zxduPBs\ndu3azvXXX016egYzZswCQKPR8LvfPcmzz/4fNpsNUfRz5ZXXkJWVjd8vtfk9p7J48RKef/5P3HLL\nnYBsJl62bAXXXXcVKSmpZGeNw14vv1sXXbSMxx57DK1Wx/PPr27jO7BYLGRmZlNcXMiECZMAyMzM\n4pZb7uT+++9CFCW0Wi333/8QycnJHfZlMBCkQVLjhvJ2V2GobMtff/5b/H6R6++e1+H17vq5ef0J\n9m4rYcX3Z5CcNnhlV/synvW1dt56aTsTpiZz9sUTetXGO//YQUOtg5t/cmanEQxD5W/eGbu/Lebb\nDflcddOcwCEiPcHn8/PS018xKiOaS66Z3g89bEt/jeen/zlAwbEarr9nXq+UleL8Wj5+ez9zFmQy\ne0HmkP+7KwyYKSZM/6JoqpG91NahVSz7MI6MCZQS6GFyUmssUQZ8PhGnffgesqBkS0b38pg/jUaN\nyawb9lmX1kYXGo0Ko6nr4/A6I1AMrH5kZOGeSliwD3FsTW4kCSKjei/QomLkRUERjsORvsSwKyj2\n2OFsjrE1m1D6Mg4RUfrmeTV8fS7WRheWXiTsKUREGlCpBJrqh+9c6IqwYB/iBBynoRBoI0Fj78OB\n1C3JWsP3ZbY2udHp1d0e3NwVkVEGRFEatsfDuV0+3C5fnzKpVSoBc4QOm3X4zoWuCAv2IU5LclLv\nJ7GinQxrjT0g2Hs/DgHH4TBd4CRJwtroIiKyb6UhlO/3Z2RMf6LsWvpaIiMi0oDd6hmRVR7Dgn2I\n05dQRwWVSsASbRjW205rkwuDSYtW1/tAroDGPkwFmsftw+vxY4nU96kdRSAO13FQ+t3bKDGFiCh5\nHEdCgbxTCQv2IU6LYO/bJI6KNuJyyjVnhhuSJGFvchNhCZFAG6amGGujLIAi+qipBmLZexXyOPhY\nlV1sCDR2kP1YI42wYB/iNDXI3v++xp8PZzu72+XD5xOJ6KOmqtGoMUcM34gQJfbc0kdTjPL94TYO\nu3fv5KGH7gv0uzNTzD333MbRo0e6bU/Z+diaXPzpT39i587tverX22+/idvdsjg89NCPsdsHt0R0\nWLAPYSRJoqnBiSW6995/BWXbaRuG207lRY6w9L3ssCXagK3JNSztqoqG3dcFztI8F4abYAcQBLoV\n7MGiaOyNDU7uvfdeZs2a06t23nnnTdzulrF86qlnMZsHt9Dc8C5MPcJxu3x43H5S0ntvX1dQzBjD\n0Z6oLEZ9FWggh41WljZht7r75LcYDBRTTF8FmlanwWDU9Eiwu1wufvnLn1JdXYUoilx//c0sXnxu\np2V1y8pK+b//exybrQlJEvjtb58gNXUUq1Y9x9atmxEEFddddxPnnHMeu3fvZPXqvxEVFU1BwQkm\nTJjIo4/+FoBvv93Mn//8DNHRMYwdOx6ApkYnGq0qEBnkdrt5/PFfU1RUSEZGBh5PS7TP9u3f8vLL\nf8Pr9TJqVBqPPPIYBoOBK664hLMXXcAXm7/Er1vGV9veZNas09HrDfzvfx/xm9/8AZB3Cf/+9xs8\n8cQzPP30Exw9egi3282iRedw00238u67b1FTU80999xOdHQ0zz33PFdccQkvv/xP3njjNZKTU1ix\n4nIAVq/+G2azmauuupZ//euffPnlF3i9Ps46axE33dR9fa2eEBbsQxjlxeurLRFaBPtwtCfam0In\n2C2tQh6Hm2BXNHb/hv+yY9WePu065tg8iH6J/IffBsAyew4JV1zd6f1bt24mPj6Bp556FgCHw95l\nWd1f//oXXHfdjaxYsZTy8jpEUWTjxvWcOJHHa6/9m/r6Om6++TpmzJgJQF7eMV5//R3i4uK4444f\nsn//XsaPn8hTT/2eP//5RUaNSuOXv/wZ0D6Gfc2adzEajbzyyr84ceI4N910LQCNjQ28+upqnnvu\nr+j1Bt5441Xeeut1brjhZvk3R5pZMu8uRqfHcrSkVB6XOafx9NN/wO12odcbWLfuCxYvXgLAbbfd\nhcViQRRFfvSjO8jPP87ll1/Nv//9ZqCcsYzcr3PPXcJzz/0xINjXr1/LM8/8me3bv6W0tJiXXnoN\nSZJ4+OH72bt3D9OmhS4TOCzYhzC2EAo087DW2BUTRN8XuMCBG43D7+ARa5MLlUpAq1XTVxe4oBKQ\n/CKSJJs3uiM7ewyrVj3HCy/8hTPOWMC0adPJzz9Bfv4J7rvvruayuhLx8Qk4HA5qaqpZsEAuBqjV\nypr1vn17OPfc8wGIiYllxoxZHD58CJPJxKRJk4mPjwdgzJhxVFRUYDAYSU0dxahRaQAsWXIhH3zw\nH3kXm9ayKO/Zs5srmhelnJwxjBkzDoCDBw9QWJjPHXf8EEmS8Pl8TJkyLfC9JUvO57//ymuj7KjV\nak477Qy+/vorFi1azJYtX3PXXT8CYN26z/jwwzX4/X7q6mopKCggO3sMIDX/pyD//9ix42loaKC2\ntob6+noiIyNJTEzinXfeYvv2bdx007VyXXmni9LS4rBg/66gCGFzH6NBWrcxHCMhrMoCF4JxaHEi\nD79xsDW6MVv0JF55NQl33dKn2ibfrDvOvu2lrLxuJkmp3Z/MlZ4+mpdffp0tW77hxRf/wty5p3PW\nWYvIzs5pV1bX4ei4iuOpma6t/60IfwC1WoXf3/HS5fPKu5RTzVGtfVBKu5IkMWfO6Tz22O86bMto\nNBIRaWj3TixefB7/+c/bREZamDhxMkajkYqKct566w1efvmfmM0RPP74r/F4uleSzj77HL78ci21\ntbWcc86SQL9+8IMbuOSSFd1+v7eEnadDmIBtOQQCTa2WD8Iejs5TW5MbQQCzpe+VKZXdj32YmaT8\nPhGH3ROyc2t7GvJYU1ODXq9nyZILuOaa73Ps2NFOy+qaTGYSE5P46qsNAHi9XtxuF9OmzWTdui8Q\nRZH6+nr27dvDpEmTO31mRkYmlZUVlJeXAbB27Wf4mmuntx6H6dNn8PnnnwCQn3+cEyfyAJg8eSr7\n9++lrEw2s7jdLkpKits8IyJSj8ftlw8faWbGjFkcO3aUDz9cEyjVa7fbMRqNmExm6upq+fbbzYH7\nuypJvHjxeaxb9zkbN67n7LPPAeC0007n448/xOl0No9tdaAEcKgIa+xDmFBq7CAvEDVVNiRJGlLn\nM3aHvcmFKUKHStV3PSSwcxlmC5xijuprcpKCEvIYbJJSfv5xVq16DpVKQKPR8sADP+uyrO4vfvFr\n/u//HueVV15CENT89rdPsHDh2Rw8uI8bbrgGQVBx5533EhMTS2FhQZtnKXNTp9Px4IOP8OCDPyI6\nOobc3OmcrKiT+99KsC9ffjmPP/5rbrjhe4wdO45Jk+QDSKKjo3nkkcf41a8ewePxIggCt9xyB+np\no1Hs4Ip5T1kwQD7qc968BXzyycf84he/BmDMmLGMHTue
H/zgKlJTR5Gb22LSueSS5TzwwL3Exyfw\n3HPP07q8cVZWNg6Hg4SEJGJj4wCYM+d0iooKuf32GwEwmUw8+uhviYkJnWkwXLa3Cwa7lOcH/9pD\neXEDtz54FuoujkELtp99LXXaV3oznqIo8dLTm0hIsbDyBzND0o9X/vQNOr2G7912Wkj6OBCUFtbz\n0Vt7mTUvg7lnZfW5nzUnrbzzj51MmZnKmUvGhbCnbQn1eH79RR77d5Zx+Q2zSEju+1F0u7YUsXVj\nAVf/cC4xCb2vQzRQhMv2jgDsVjdGs7ZLod4ThmPIo8PuQRSlkJijFMwWPXbr8KpuGKr6KAqBujnD\nLJY9lKGvcjtKlNTwS9zrirBgH6JIkoTN2vc0+tZERA6/kMdQJeW0JsKix+cTh1V5hUCSVojGQT5R\nSh1wTA8X7FY3KrXQp+qWrVHGczifVdARYcE+RHG7fPh9Ysjs69CqNsYwKlVqDziQQ6OpwvAM/VQW\n41Bp7CDb2a2NrmG1c7Fb3Zgj9CHzEQV8DcO48mlH9FmwV1ZWct1113HRRRexbNkyXnvttVD06zuP\nLYQhfgrDWaCFUmMfjg5UpU5MSOdDpB6vx4/X4+/+5iGAKMqRQaGIjlIwRegQhJGnsfc5KkatVvOz\nn/2MiRMnYrfbWblyJfPnzycnJycU/fvOEuqIGBie2afWfjDFBBY42zAah0YXRpMWjVYdsjbNES0L\nvU4/9APkHHYvktTS71AghwHrh/VZBR3RZ409ISGBiRMnAmA2m8nJyaGqqqrPHfuuE8oYdgVThKzp\nDCfB3qKxh84EEXAiD5NxCPhbQjgGMPwWuP5QdkAOIW1qdCGKw8ck1R0htbGXlpZy5MgRcnNzQ9ls\nv2I/sA9nfv5gd6Md/TGJ1WpV83Fg7V9kT2UFjsOHQvasUKE4y3p7aHFHKFv51uMgSRKi14vo9SL5\nhpZT1enwIvqlkO5aAMzNC73d2lI0S3S7se3Zjd/WtuyszWbj/fffDfxbKaHbEU8++XuKigq7fX5X\nbbRGKcMbeCeC0NhffvnFoMvwRkQakEQJR/MC9/bbb+JyOrHu2IansmJIlOHtKSHbf9ntdu69914e\neeQRzGZzt/cHG4/Z3xT+8xU8dfWkLL2IjB9ci1rfdtIMVj+V1OnRmXHExoduPKNiTVSWNRIfFwGS\nSPlHH1P15QYchUUApF9zFaOvvrL3HQ9RPxUcNg9R0UYSE7tPew+WSItcVsDr8ZOQYEH0ejn8uz/Q\nsGcvxwFUKjK+/z3SLuu/lO+eUOlpBCA+IaLN+PV1bqamRcv/I8lt+Z1ODj3zJE2HDiOo1URNyyV1\n6UXEzJqJ293IRx/9h1tvlZNqoqNN6PWaDvvw9NNPtPm3co8oim2SzLpqozVarZqYGBP2etlhmjIq\nqsvviKLIT3/6QPcD0ExisoXjh6vQqNUkJFh49+03mJF/HOn4CZIvWMI//vFy0G0NFUIi2H0+H/fe\ney+XXnoGQVm+AAAgAElEQVQp5557blDfGSpJIEm33U3l6r9R8dHH1Gzbwagf/wRdQiIwuMkqNVXy\nc90eb7d96Ek/DUYNol+iuKgW19frqHn3bVCrMU+bjqesjJI3/43D4SFu2aV9/g196SfIafQ2q5vU\n9KiQ/x10ejX1tQ6qq61UvfUGDXv2ohuVhikhDmt+AUX/fANfXDLmyVNC+tzeUFoip5urNEJgHEIx\nN33N1SGrKps4WVpD2XPP4Dx2FOOEiYgOBw27dtOwZy8Zj/2Gx//2V4qLi1m27BJmzz6NM86YT0ND\nE7fddme7Urv33HMbd999H+PHT2DJkrO46qpr2bbtW+6++8fY7fY2ZXg9Hl+733FqGV673Ul9vYP6\nCh8V1cf42aOrEVRSuzK8F198Cdu3b2XlyivZunUz8+efGVQZ3sYGG3GWCZQUTeTdF/8fVSdP8ugX\nnxEdHcOq85exaNHZg16GVyHYxTwkgv2RRx5hzJgxXH/99aFobkAxZmeT8cvfUPOfd2hY+wU1775N\n6h13D3a3sFvdGIyhdZZBS9hgQ1k19o8+QB1hIePXv0MTFYW3tpbS/3uC2g/eR9DpiD3/wpA+u6co\ntt9Q25ahJUnJumsnDWu/QJeSyuhHHiUpLZ6SbXspfuL3VP79RTIe+y2a6OiQP78nKONgajZBbF5/\ngsK8GsQ+HhaimJSP7KvEsXsHWXlHiZg9h5RbbkdQq7Ht3kX5qj9R9cY/uf32uykszGf16jcAWUB2\nVGp36tRpbZ7hdDrJyRnDD394Gx6Ph6uvXtGuDO+pdFaGt7qqhgN5a3nxpb+RkBTdrgyvTqdn1aqX\nALnMMARXhvfEkZM89PC9HDt8mNOLi3lPq+WPj/6G1IVnN4dVDn4Z3p7SZxv7zp07+eijj/j2229Z\nvnw5K1asYNOmTaHo24Ch0ulIuOp76DOzsO3cgfuUQkEDTX8kJykoNvvyz9Yjud3EX34lmqgoALRx\ncaQ9+DDqqGhqP3i/nZ11oOmPUEeFCIset8tH+auvIOh0pNx+F6pmM5whK5uEK67Cb7VS8dILSOLg\nnrak2MAVm3ioUELBRb+Ir64Oc+40Um6+DUEtKxMRM2ZinjET57Gj2Hbvavd9pdSuIAiBUrunotFo\nWLhwMQBFRYXtyvB2xJ49uwPXWpfhPX7iCI22kzz007u48cbv8emnH3Py5MnA95SCXa1pXYbX7/ez\nZcvXnHmmXE543brPuOmm7/PL395Do/Ukx3ftQLTZEIwmLDNntYqVb1+G9/jxvEAZ3m3btgbK8N50\n07UUFxdRWjq4MqTPGvusWbM4fPhwKPoyqAiCQPzyFZQ9+wy1H35A6l33DFpfPG4fPm9ok5MUApl2\nReUk5owhct78Nte1cfHELDmfmnf+TeNXm4i98KKQ9yFY+iPrVEFxwDk9kPW9a9GPGtXmevQ55+E4\nchj7nt3Y9+4hYkZo6tT0BsWpp8yHeYtzuPSq6SExT73+/Ld4GxsZW7uDhB8/jqBpKxISr7qGwoMH\nqPv4w3YLXDCldnU6Xa+SiToqw+tyekhLnsA//vFih98xGjs+OKW7MrySX8Odt/4Ya1UtKrMZVSft\nwOCV4e0p4czTVpgmT8WQnYNt905cxUWD1g8lWsPcLwJN1vpcmggSr/0BQgcVE6POPAtBr6dh/dpB\njRCx2xRNNfTjYDLJWqkvbhSR889sd10QBOIvXQlA49eDuwPtL40dwKgRcUtajJNz0aWktruujU8g\n9qKlaB0ObLW1PW6/dVZrR2V4O6KzMrwW4yiq6gq6LMPbEd2V4XW6rZRXH8EraIg9/0LM5oghV4a3\np4QFeysEQSDuUnnVrf1wzaD1w94PMewK2qZqAPyJ6RhGZ3R4j9pkJmr+Anz1dR1uwQeK/opbBlDX\nyMJFGJ/b4eIGoE9PR5+ZhX3/PnwNDSHvQ7DYbW40GlW/JBFprDVIggrDWZ0HPcScfwGRlkhydDqu\nu+5q/vrXP7W
7p7WG3dn/63Q6Hnro5zz44I+4665bSOlgIQG5DK/D4eCGG77Hm2++zqRJU/B6fKgF\nI8vOv5lf/eoRrr/+Gm677SaKAwpY57sCpQzv1q1bmDdPXsRbl+F96onfkBSdgU+tJ3rxOYEyvD/6\n0R3t2u6sDO95553P7bffyPXXX82jjz6M0+notD8DQbhs7ylIkkTJE7/HdeI4s/72PFbVwJ+LeWhv\nORs/OcbZF09gwtTkbu/vSYTEybfe5D8FSSTGaLns9vaaqoLnZCWFP/8phpwxjP7ZL4Lue6j6CfDZ\n+wfJP1rNdXefEXKtffsfVrFDmMycuUnMXjyx0z42bFhP1euvEb/ycmIvWhrSPgTLK3/+Bp2ubZnh\nkETFNNTz6ZNvURI1kcuun0liSuchpZWvrKbp602kPfAwpgkTO73vVEIVWVZfY+etv29n4rQUFl04\nvs/ttWl7/Vo+3tSI0xTLzQ8uGtJnFYTL9vYSQRCInLcAgLqt2walD/Z+qBMDIIki9p3b0IsunGLX\n2p8uKRlz7jRcJ47jzD8R0n4Ei8Mun5xkNIXWBOEuL0dVIv8mp6/rqCPL3NMRtFoav/lqUIpl+f0i\nTrs3kDUcShq+XI/eKzvIFbNXZ0SedjoA1m1bQ96PYLDb+m/3Ztu1E53fgU8Uhk3dnO4IC/YOiJg+\nAwSB2i3fDsrzbf1kgnDmHcNXX49Rr8Jh93QrqKIXy9tza6tjwAYSu9WDyaxDpQqtBtX09Sb0Pnvg\nGV2hNpmImDUb78mTOPOOhbQfweC094+fQZIkmrZuwaCSfSjdFYYzjp+AOioK687tg+J3USKkQlkA\nDMBvteI8dpSICNkRPFzKK3RHWLB3gCYqCuOYsTQdPoKvqWnAn99iYw/tJLZukxeqiPhI/H4Jj7vr\nF9Q0YSIqgwH7/n0Drq1KkoTD7gm5pir5fDRt+Qa9SYtaLQRV4TFqwVkANH61MaR9CYYWB3Jox8FT\nUY6vpoao0SmAnOHbFYJKhWX2XES7HfuhgyHtSzD0lyPdtncPiCIxaXJSYncL/XAhLNg7IWLGTJAk\n7Ht2D/izbVY3Or0arS50zjLJ58O6cwfqqCgik2SnT3eTWNBoME2egre6Gm9l+xjl/sTjluvRm0L8\nIjvzjuG3Womae7qcpBSEhmYcPwFNfDz23bsGXFvtLweyfd9eAGInjmnznK6wzJVt/IqCMJD0V0CB\nbfdOABInZAItoaXDnbBg74SIGbOAlj/8QOKweUKumdgPHUS02bDMnoup+eVw2LufxObmTEJbsyAY\nKPorxM9+8IDcbm4uZoseh82Dv5sMTkEQME/JRXS5cBUMbME4RZMO9c7Fvm8vCALxM+SSCcEscIbs\nHLTxCdh270Z0D6wA7I8FTnS5cBw8gG5UGjHpSfJzutm5DBfCgr0TtAkJmLMycRw+hN85cLWa/c1H\ntoX6Rbbtkhcoy9zTWqr6BTGJzVOnyvfu3xfS/nSHsuiEWmN3HDyAoNFgHDs+ICQUO3ZXmCdPBhhw\nM0TAaRjCcfA77DiP52HIysIQG41OrwlqLgiCgGXuaUhu14BXArXb3KjVAnpD6Hax9gP7kXw+ImbM\nDJykFLaxfweIPf00JJ8P+/6B01Zb6oKEVrA7jxxGZTJhyMoOtN2dXRVAExWNPjNLNmEM4ALXHxq7\nr6kJd0kxxrHjUOn1LQePBGGGMI6fCCoVjmaNf6DoD03VcfAgiGJgN2a26II+VcvUXBTNcWSABbvV\ng9kSuiPxoGU3HjFzVuDIwWDeieFAWLB3QdzpcwGwD2CSjqNZezSZQ/cie2uq8dZUYxw3HkGlajk5\nJ0jtxDw1F/x+HIcGTqgFxiGEgt1xWNa2TZNk4WQyB7/AqU0mDNk5uAry8XeSldgf2PvBFKPY1825\nzYI9Qq6b4/N2H+pnyM5B0GpxHDkSsv50h9/ffCReCHctkt+Pfd9eNHFx6NNHN5+jGjbFfCcwZWSg\njo7GcfTIgEWFOPohCkJ5CZXEkp4INICIZgFg3zdw5pieHKoQLIq2bWo2q/Rk5wLIJXwlaUC1VbtN\nPrZOG6Iqn5IoYj+wD3VUNPrmzOOWk5S6HweVVotxzFg8pSX4rAMTMRYI+QzhrsVdXITodGKeMhVB\nEFCpBExmXdh5+l1AEARM48bjb2rC26qKXH/SH84yx1G5SJsi2I1mbY+0E31GJmpLJPb9ewes0mGo\nw/wkScJ+8CBqiwV9Wnpz280CLQgnMoBpkrwgOAbQzu6whfbwZldhAX6rFfPU3IBZo+UkpeDGwdg8\nj5xHj4asX13RktcRwnfimNx347gJgc9MEXrstu7zO4YDYcHeDcaxcvqy89jATGJFyChadV+RJAnn\nkcOoLRZ0qXIFQ5VKhdEUvHYiqFSYp0zF39SEp6wsJP3qDiXr1BCirFNPeRn+xgZMkyYHasP0VGM3\nZGahMhqxHzwwIC+/z+vH7fKFdtfSvCgpTnHo+dmnioLgODIwVV0d/RDDrrzPxrHjAp+ZI3T4fWK3\n+R3DgbBg7wbjOFmwO/IGRrAHJnGItp3eqpNytun4CW2KXZkidEFlnyooL4DzeF5I+tUdoc46DZhh\nJrWciNRTk5SgVmOaOAlfTQ3eATiwvT+Sk5S/n6KwyO03C/Ygk3MMGZkIegPOgRLsIfa3SKKIM+8Y\n2oQEtLGxgc+VMOCRkKQUFuzdoEtJQRURMWAae8AUEyKNXdGqTi3cZI7Q4fOKeNzB1cYwjh0LDIxg\n74+sUyVMUQlbBNDpNWi0qh5FQgSiQgbAkRyIkArRIi+JIq4Tx9EmJaGJbCn4pZg4gtXY5XDRcXgq\nK/A19H952lC/E56yMkSHo83iBq1MUiPAzh4W7N0gqFQYx47DV1uLt7am35/nsHnQaENXotXZiWBX\n4sODSVIC0CY3L3An+l+whzrrVBJFXMfz0CYno4mOaXPNZNYFbWMHMI1vti/n9f84hNqR7ikvQ3Q6\nMeaMbfO5orH3xHFomiDbph1H+z865tSjAfuKsvtWduMKph7kdwx1woI9CEwBO3v/F4Gy290hsyVK\nkoTjyBHU0dFok9qW/+2xGUIQMOaMwVdT0+9aWqhj2D3lZYguF8bsMe2umSL0uBxeRDE4k5Q2KUle\n4PKPh6RvXRHqyKCAGWZMW8FuNMsFsHq0c5kwSf7OAJye5rSHVmMP2NfHnaqx93yBG6qEBXsQKBPA\n2c92dlFsLtEaqi1nRTl+axOm8RPbJXa0bL+Df5kVgdDf5phQZ50qZYcNOe0FuzlChySB09GDBS47\nR17gGvv38I1Ql6pV/m6GUwS77EzXYg8iA1dBP3o0KpMJ59H+F+x2m6f5oJG+h3xKkoTz2FFZ2UlI\naHOtJToorLED8MgjjzBv3jyWLVsWiuaGHPr0dFQGQyBEqr9w2r1A6JxErhOyVqnYx1ujJED1RDsZ\nKMEeao3ddUIW7MacnHbXerpzATlJB8DVz3XqQ+08dR0/jspsRpfc/vAW
U4QuqNIKCoJKhTFnDN7q\n6n6vgKr4W0KRdeo9eRJ/UxOmcePbtWfqYeLeUCYkgn3lypW8/PLLoWhqSCKo1RjGjMVbWYmvsbHf\nnhNq779SsEoRRK3paagfgD4zE0GjwXm8f80QIR+H/BOoDIZAuGdrejMOxmbN33mifwW70idjCHZw\nvoYGOfs4Z0yHRwGazDo8bj/eILJPFQxZ2QD9WhhNFCWcdk/ozTCnOE4BjCYtKpUwIsoKhESwz549\nm8jIzo/VGgmYAuaY/rOzh7rgkzM/H0GnQz8qrd21nhQCU1BpdegzMuWsvX6s7hdK27LfbsdTUY4h\nK7tjgdbDJCUAQ1YWCEJgR9RfOOweDEYtanXfX9PO7OsKyjj0RGs3ZCuCvf8WOJfTiySFbpFX3l/j\nuHHtrgmCIIcBjwCNPfSn445QAtvvgnwss+cAYPPa2V65G5WgIjMyndSIFLSq4IZUkiRKqmwcKqzH\n6/Oj16pxVcs1SEKhnYhuN56yUoxjxiKo29smjQETRM8msXHMGFwnjuMqyA9E2tS7GjhQewS7186M\nxFySTAndtNKCy+OjqNJKQYUVm9NLXJSB6pPyGZmhMEEoQqejXUvrZ/RES1MZjOhSR+EqKkTy+RA0\nGlw+F2W2SmpddVg9NqbGTySxB+NQ3eCk+KSVmkYXjXYPybEmbFY3lsj+ta8rKHPObvMQGR3cOb+G\nTEWwFwQ+a/JYOVhzBLVKjU6tY5pxLALB/4ayGjvHShpweXx4vCKG5jyLUNVOchacQGU0ouvkIG1T\nhI6aShuSJA3ps0+7Y9AEe7CHsg42Sj995qmUCgL+smI0ESJrDn/G+vxvcPtbBIJereOHs65mUdYZ\nnbbncvt4d30ea7cXU9voanMtFRiFig0HK7GkRzNtbPCC4dTxbDxYDJJEzKTxnY61KUKH2+Xr0d9C\nNWsa9Z99iqqiGPuUFJ7f/k8K6ksC1z/K/4zxcdlcPuVipiVP6rSftY1O3vz8KGu3FeM/JSJlAgIR\nCHyxr4JLzxpDQkzvDxR3VpYCkDRzKrEd/E7RKz9b8kuBvgUzHo1TJnLys1JM9joKLV6e3foyTW5b\n4PoHJ/7HOTkLuHzyxUQbOt7NSpLEoYI63t9wnG2HKmmdKyYAs1FRWu9k8+EqLpqXiVbTdoHuyd+t\nvCgfQaMhbfZU1Pr2QjIxSW5Lq1YF326ChbKUZNyFBcTGGvmycAtv7H0fu7elCqjmoIZrc5dz4biz\nUQkd7zx8fpEvthbxxbZi8kraOqQjgfGo2Ha8moRJSSyYntprgeuz2zlWWUlU7lQSk6La/5wECzGx\nZqrKrUSY9CEvGT2QDJpgD8XJ5f3NqSes65JTsB4/zs8+e4I6dwMx+miWZi3BrDVT2FTCjpO7+eu2\n1yisKueirPPaTEBJktiTV8O/1h6jtsmN2aDh9MlJ5GbHYTHpcHv9HPy2GFu5lX2FdWx9YTNn5qZw\n9TljMXYT097RSfB1u+WEHCk5vdOxNpq0NDW4evS38CXIduqSndt5Ub0Zl8/FxNhxTImbiElrZGvF\nTo7WHucPm1Zx69TrmBrfItwTEixUVDby4TcFfLatBK9PJCnWxPQxcWSlRBJl1lHX5GbfF3l4PH4+\n2JTPf78uYMVZ2Vxw2mhUvXiha/fLBbs8cakd/k63V3ZY19bYqa62djiWHZI6GoAvv3iP12MLUQkq\nFqbNJ9mUiFqlYm3RRj4/vomvCrfx4xm3k2ZpqyHaXV5Wf3yY3XlybkRWSiRzJiQSH2Ug0qyjsKSB\nE5sK8UgSf//gAO9/mcfV54xj1viEwFgG+3cT3W5s+QUYMjKpa/IA7XcnIvKqUlHeSHxK8AuGdnQW\nrq1b+MM7v2efUIlBrWdZ9gVEaE04vE6+LPuKV/e8y9aivVw/+WoidW3bLqux8/f/HqKo0oogQG5O\nHLPGJWAx6dBpVRzeW0HV4WqqrW6een0H//06hmvPG0dKnDnoPiooNeRVqe3fCWU8NVp58SkuriMu\nIaLHz+hvgl10QybYR0LhnO7QjE7HU1GOVFXDBdPO56LMc1GrZC3qtJRZLEybx1/3ruZ/hWupdzdy\n7YTLEQQBUZR4Y+0xvtxVhlolcPEZGSw9IxO9rq0GdnJfJTbgnqum8eaXJ/hqXwWHCuu5fflkclLb\naxhdoURsKHbQjjBF6KmtsuP1+II+hk9jiUSKjcZZkI97ZgLXTbqKuckzA9fnJs8krz6fv+59mb/v\n/ye35t7A5DjZP9Fk9/D/3t7L4aJ6Yix6li/IYt7UZNStbN+SJLH/02MkJ0bww9mjeG/jCd7dcIJj\nJQ3cvHQSEUZt0GMgiSKu/BNok5JQR3T8khqMvXOYGZtNO1VH9hC5KI2bp/6A7KjMwPXTk2ezsWwz\n7+V9xPP7/sFDs+8hSi9r7gUVTTy/5gA1jS7GpUez8qxsxqZFtVEEIlUCJyhk/oxR5Khh3c4yVr2/\nn0vmZ3LJgqwe9dVdUgx+f9dzQTHN9cDGDqDPysK6dQuewgJy58zmqvHLida3zNWLpy7iua//wcHa\nI/xt32vcN/P2wDuzbmcp/15/HJ9fZP6UZFYuzCHmlNBOZ7mVqsPVfO+C8aw7Ws3+/FoeW72dW5dN\nYvaExB711VUom4wMWZ2PnzIOTrsHgt8wDzlC4jz9yU9+wtVXX01BQQGLFi3ivffeC0WzQwqv38s2\nXSUAC8VMlmYtCUxQhWRzIg/MvovRllFsqdjOloodeLx+Vr2/ny93lZGWEMGvb5rLZQtz2gl1kF8q\ntVpgfGYsj14/m6XzMqizunj6zT0cKqzrUX9dBfmoLZFoYuM6vcds7rkDtdZZzwmLG6Nb5Oa0S9oI\ndYWxMdncnnsjgiDw0v5XKWgspqzGzk+e28jhonpmjI3ndzefxpnTUtsIdWjJOjVb9MyfmsKvbpzL\n5KxY9p2o5TevbKemIfjDPjyVFXKmZQeJSQqCIGDsRbnWfK0Nl05gVK3Iw3N+3EaoA6hVahann8ml\n2RfS4G7kxX2v4vF72Hm0isf/uZPaRheXzM/koWtmMC49up15QVlooqMMXLV4LI/dMJuEaAMfflPI\nX98/gKsHhapcRYUAGDK6EGi98DUA7NLLO46JNjO3TP1BG6EOEG2I5I7cG5mdNJ2CpiI+yP8ESZJ4\nb+MJ3vjiGEa9mrtXTuWHSye1E+rQ4sxNSbLw4ytyuXP5FNRqgefXHGDtjpJ293dFQLBndqXs9G4c\nhhohEex//OMf+frrrzlw4AAbNmzgsssuC0WzQ4qPC77ggFGO1811xXRq54vUWbhl6nUY1HrezfuQ\nJ975ht15NUzMiOGn184kNb7zLaTdJod1CYKARq1i5Vk53L1iKn5R5Nl39rEnL7iSBr6GBnx1dRiy\ns7u0R5osPZvEkiTx5tH3qIyRp022tXPn5vjYMdw85Qd4RR+vHnybJ/+1g8paB8vmZXLXyqmdmpdO\nTaOPNOu478ppLJ2XSU2ji6f
e3E1NY3DCPbBr6SB+vTXmCB32HhREs3nsvHbk31TGa7FYvZjdnX/v\nvIxFnJ48myJrCX/Z9i9e+OAgGo2K+66axvIzszstcnZqyOeohAgevX4OE0ZHs+tYNb//xza8vuBC\nE92FhYBcfrkzeqOx76s+yIeu3fhVkNOk79SGLggC14xfSaIpnnXFm1i1di0fbykiMcbIo9fPZua4\nzlXj1rH8giAwe0IiP/3eTCLNOv61No/3NgYfkeMqKGhWdmI7vUcJKuhJstZQJJx5GgQn7VWsL/kK\nX1IcqFS4iwq6vD/WEMOKMctw+92UGzczd1Ii9105DVMX5zVKUnO87ikOmxnjEvjRFdNQqWDV+/vZ\nd6K22/4G4tezOtdMAMzmniVkbKvcxeG6YwGNx11U1OX9U+InMit+FtWuKlxRedxxWS4rzsru0lau\nCJbWsdsqQWDlWdmsODNLFu7/Ck64K9Ea3Y2DyaxD9Eu4Xd1rwZIk8caRd2n0WIkeI0cFKZpgRwiC\nwDUTVhKvTeaE8xCaqHruv3IaU7I630lBx4WvIoxa7r9qOtPHxLMnr5oXPjiIr5uDuEHW2AW9ocPE\nJIWeFkRz+py8ceRdVFodmrQ0vKWliN7Ov2vQGPjh5O+jktQckjaQkqziZ9fOJD6qa8d4R+WbM5It\n/PwHs0iKMfLxliI+3VrcbX99TU346moxZGV1qewoCoUzrLGPbCRJ4p28D/FLflZOvBR9Wjru4mIk\nX+dCQJQkDuw04a9PQB1Vx4QZTWi6iUV2OeV6JR3F607OjOX+K6ejUgk8/8EBik927TQLVrD3ZNvZ\n5LHybt6H6NU6zjvjGvk5XQg0gEa7h6Nbk5G8OvTp+czO7d7x4+iiLsiy+Vksbxbu/+/tvTi6EcTu\n4iJQqzuM429NT8Zhb81B9tUcZGx0NhNzF7Y8pwsKym1U7pPNIElTCsgZ1X3OR2dJWhq1ijuWT2ba\n2Hh259Xwj/8d7nKnIbrdchz/6NEdxvG3xmTWBa2xf160AZvXzgWZ5xA1Zjz4/biLuxawhw77cBWN\nQ9B4mTCnhqggok4cNg9GU/vyzfHRRh64egbRETre/vI4Ww5UdtmOMle72rVA730NQ42wYO+GvTUH\nOVx3jImx45iWMAVDZhaSz4e7vPMDJ9798gTbDlUxyn0GBrWeTwq/aBMW2RHdnZw0Lj2aW5ZOwuPx\n8+w7e6lrcnV4H7QW7F072QICLYhJvOb4/3D4nFyacxEJcaloE5PkOO5OhIrXJ7Lq/f1U1/qZrJ+P\niI+Xd/272+d0V6L1kvlZLJmTTkWtg+c/OIC/kxOdJJ8Pd0kx+lFpCJquHcPBVroUJZGP8j9DQDYt\nKDZrxYbdETUNTv7yn/34bdGMi5hMtfsk31bs6PI50PU4aDVqfn7jaeSkRrLl4En+u7nz57uL5bBX\nfWb3DldThB6n3dNtQbQ6Vz1flnxFtD6KxekLMGQpOR6dL/Q7j1bx7/XHMTtyiNPHsa1qBycd1V0+\np7vyzXFRBu6/ajomvYbV/zvMwS78UO4gHKcARlNYsI94vH4v7+V9hFpQc8XYSxAEAUPzC9LZJN5y\nsJJPtxWTEmfivhWncXb6mdi8djaVbu7yWQFbYhfJSbMnJHLl4jE02Dw89+4+3B2kf0uShKuwAG1S\nMmpT1yFhwdZJqXLUsK1yF6nmZM4cdToAhsxMRLsdX017u78kSbzxxVGOlzYyd2Iid5x1PuNixrC7\n4gDHG7rW8oMpJ3Dl2WOYlhPHwYI63lrbcfanp6ICyefDkJnZ5fOgbXJOV+w4uYdK+0lOS5lFkjkR\nTXQ06sjITk1STreP597bh9Xh5drzxnL9tOXoVFo+PPEpTl/nCzO0ONI7K99s1Gu457Jc4iL1vP9V\nAbuPdSwkXUWKwzCzy+eBPA6SJO8eu+Kj/M/wij4uyb4AnVrXUlqgsOPSAkWVVv720SF0WjX3XT6D\n5WMvDCySXeH1+PF5xS7nQlpCBPdenosgwAtrDlDdiXM9GMcpgFqjQm/QhJ2nI5mvirZT56rnrLQz\nSN51YmQAACAASURBVDLLoVXKit/RJC6qtPLqJ0cw6tXcc1kuEUYti9PPxKgxsLZ4Iy5f5xqhI8ia\n00vmpLNoeiolVTZe+7T9IdvemmpEpxNDN1tOCH7b+VnReiQkLsg8J+AgU7a0rg78Det3lbFpbwUZ\nSRZuvGgiKpWKZdnny20Vru/yWcEcqqBSCdx6yWRGJZhZt6uUTXvL292jaNHKgc1dEczOxS/6+Tj/\nc9SCmosyzwVk+7l+dCa+ulr81rbmMUmS+Pt/D1FWbeecWWmcPTONaH0USzIWY/XaWF+8qcs+OZr9\nLV3ZgyPNOu65LBedVsXf/nuI0mpbu3sCAi2I+dCShdv5PC2xlrG9cjdpEanMSZ4BgDYxEUFv6NAU\nY3V4WPX+frw+kdsumUxGsoUZCVPJsKSzu2ofRU2dR7Z0ZZZrzbj0aK49bxx2l49V/9nfTuGRJAlX\nQQGa2Lg2B4x0hnK62HAmLNg7QZREPjwiv8jnjl4Y+FyXOgpBpwts7RRsTi+r3t+Pxydy89JJJMea\nADBpjUFp7cEWvhIEgWvOHUd28zb8y91tTUKK9qgfPbrb36jRqtHp1V1O4hpnHdsqd5FkSmRGYss5\nmYqgcDVHXCjklTbw5to8Ik1a7rlsKnqtHNaZHZXB5MRxHKo7SnFTaafPC/ZlNuo1/OiyXMwGDa9/\nfoyiyraC1V0s90s/OrPLdiC4sgJbKrZT46pjfuppxBlboioMGfLC4TrFzv7p1uJANNTV57SEWy4e\nfSZmjYmNZZvxdGKeCzjSgygtMTrJwg8vnoTb42fVf/bjPCUM0l1UhMpgQJuY1G1bxiAW+k8L5UV+\n+ZiLAou8oFJhGD0aT0V5mxpCoiTx9Bs7qWkO7Zw+Nl6+XxC4NOdCAD488Wmnz+rJwe4Lp49i4fRU\niqtsvHqKwuOrq8NvberWDKNgMssZ2X7fwBzc3h+EBXsn7Ks5RLn1JHOTZ7aJzRXUavTpo3GXlSF6\n5IknShIvfXQoMIFnnFIKYHH6AowaY7PW3vEWvCfHf2k1Ku5cPoUIo5Y31+ZxpJVtUXHkBaOhKc/r\n6kX+vOhLREnkgszFbcLZFE3Y3cq+3OTw8MIHB5GQuGP5FGIjDW3aWjHxAgA+K/qy0+c57B50ejUa\nbfe1t+Ojjdy8dBI+v8hf1+zH4WoxIbiKikClQp/eteMUujdJ+UU/nxauR6vSckHm4jbXlJ1L63E4\nWlzPuxtPEB2h47ZLJreJ1derdZyZdgZ2r6NTW3tXjvSOmDMhkQtPG83Jeif/+KRFqIkuJ57KCvSj\nM7p1nEL3C1yNs5a91QcYbRnFhJi2NWf0ozNAknCXtSzaH35dwK4jVUzJjm2XVDU+dgzjonM4Up9H\nqbX9jgtaFcULsk7M984dR05qJN8ePMmGVgpPixkmSMHeA9/TUCUs2DtAkiQ+L/oSAaGN
[... base64-encoded PNG output truncated: matplotlib figure plotting f(x) = sin²(x) together with its first, second, and third derivatives ...]
dO0VGCcjqdMBqNyM3NhVarRVdXl+hIE2poaMCa\nNWug0+lQWloKl2vig378SBCDwUAtLS1ERGSz2WjLli2iooTU3t5Oer2e3G43ERG9ePFCcKKJPXv2\njAwGA6lUKhoeHhYdZ0Ktra3k9XqJiOjEiRNUU1MjONEfvF4vaTQa6u/vJ5fLRWvXrqXe3l7RsQIM\nDg5ST08PERGNjo7S6tWrwzInEVF9fT2VlpbSjh07REcJ6sCBA9TY2EhERG63m5xOp+BEgRwOB6nV\nahobGyMiot27d5PFYgk5R9iKXSKRwOl8e+KK0+mEQhGe/SQuXLiA7du3Qyp9e/fcrFnh2ZJ3MvTs\nyczMxLRpb99yGRkZcDgc75nx+XR3d2P+/PmYN28eZDIZtFotrFar6FgB5syZg5SUFABAXFwckpKS\nMDg4KDhVIIfDgVu3bmHTpk2iowQ1OjqKzs5ObNy4EQAglUrx5Zfh2WLZ5/Ph9evX8Hg8ePPmDb7+\nOnQbZGH3+paVlaGoqAhVVVUgIly8eFFUlJAeP36Mzs5OnD59GjExMdi/fz+USqXoWONMxp49jY2N\n0Gq1omP4DQwMICEhwT9WKBRhvT0IAP39/bDb7UhLSxMdJcC7hca7xVs46u/vh1wuR1lZGex2OxYt\nWoTy8nLExobXgSAKhQJ6vR4rV67E9OnTkZWVhczMzJBzPmlhD9ZnpqSkBLdv30Z5eTk0Gg2uX78O\nk8mE+vr6TxknqFD9cLxeL0ZGRnDp0iV0d3ejuLhYyEpusvTsCfWaq9VqAEBtbS1kMhl0Ot3njheU\nyOfsY7x69QpGoxEmkwlxcXGi44xjs9kQHx+PlJQU3LlzR3ScoDweD3p6enD48GEolUocO3YMdXV1\nMBqNoqONMzIyAqvVips3b2LGjBkwGo1oamoK/fn55BtEQSxZsmTcePHixYKShFZUVEQdHR3+sUaj\noZcvXwpMNN7Dhw8pMzOT1Go1qVQqSk1NJZVKRUNDQ6KjTejKlStUWFjo3y8MF/fv3yeDweAfm81m\nMpvNAhMF53a7yWAwUENDg+goEzp58iStWLGC1Go1ZWVlUUZGBu3bt090rADPnz8ntVrtH9+9ezcs\n/w+4du0alZeX+8cWi4WOHj0aco6wPXaFQoGOjg4AQFtbGxYsWCAqSkgajQZtbW0AgL6+Png8Hsjl\ncsGp/pCcnIzW1lZYrVbcuHEDCoUCFoslLBuxNTc348yZM6itrUV0dHgdvKBUKvHkyRM8ffoULpcL\nV69exapVq0THmpDJZMLChQuxbds20VEmtGfPHthsNlitVpw6dQrLli1DdXW16FgB4uPjkZCQgL6+\nPgBAe3s7kpKSBKcKNHfuXHR1dWFsbAxE9EE5he2xV1RUoLKyEj6fDzExMaioqBAVJaQNGzbAZDJB\np9NBJpOhqqpKdKSQwrlnT2VlJdxuNwwGAwAgPT0dR44cERvqf6KionDo0CEYDAYQEQoKCsLyQ37v\n3j00NTUhOTkZ+fn5kEgkKCkpwfLly0VHm5QOHjyIvXv3wuPxIDExEcePHxcdKUBaWhqys7ORn58P\nqVSK1NRUbN68OeQc7hXDGGMRhu88ZYyxCMOFnTHGIgwXdsYYizBc2BljLMJwYWeMsQjDhZ0xxiIM\nF3bGGIswXNgZYyzC/Be68EGj7hfMcwAAAABJRU5ErkJggg==\n", "text/plain": [ - "[]" + "\u003cmatplotlib.figure.Figure at 0x7f385e198650\u003e" ] }, - "execution_count": 4, "metadata": { "tags": [] }, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "# Create TensorFlow Variables using Keras's Dense layer.\n", + "def f(x):\n", + " return tf.square(tf.sin(x))\n", + "\n", + "def grad(f):\n", + " return lambda x: tfe.gradients_function(f)(x)[0]\n", "\n", - "wb = tf.layers.Dense(units=1, use_bias=True)\n", + "x = tf.lin_space(-2*pi, 2*pi, 100) # 100 points between -2π and +2π\n", "\n", - "# We can access the underlying TensorFlow variables using wb.variables.\n", - "# However, the variables won't exist until the dimensions of the input\n", - "# tensors are known. Once the dimensions of the input tensors are known,\n", - "# Keras can create and initialize the variables. Until then, Keras will\n", - "# report the variables as an empty list: [].\n", + "import matplotlib.pyplot as plt\n", "\n", - "wb.variables" + "plt.plot(x, f(x), label=\"f\")\n", + "plt.plot(x, grad(f)(x), label=\"first derivative\")\n", + "plt.plot(x, grad(grad(f))(x), label=\"second derivative\")\n", + "plt.plot(x, grad(grad(grad(f)))(x), label=\"third derivative\")\n", + "plt.legend()\n", + "plt.show()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "docKLUaonYG_" + "id": "-39gouo7mtgu" }, "source": [ - "## Step 3: Define our loss function\n", + "## Gradient tapes\n", "\n", - "Our loss function is the standard L2 loss (where we reduce the loss to its mean across its inputs)." + "Every differentiable TensorFlow operation has an associated gradient function. For example, the gradient function of `tf.square(x)` would be a function that returns `2.0 * x`. 
To compute the gradient of a user-defined function (like `f(x)` in the example above), TensorFlow first \"records\" all the operations applied to compute the output of the function. We call this record a \"tape\". It then uses that tape and the gradients functions associated with each primitive operation to compute the gradients of the user-defined function using [reverse mode differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation).\n", + "\n", + "Since operations are recorded as they are executed, Python control flow (using `if`s and `while`s for example) is naturally handled:\n", + "\n" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "cellView": "code", "colab": { "autoexec": { "startup": false, @@ -257,145 +182,42 @@ } }, "colab_type": "code", - "id": "0_w8ZJSCtuY7" + "id": "MH0UfjympWf7" }, "outputs": [], "source": [ - "def loss_fn(inputs, labels, wb):\n", - " \"\"\"Calculates the mean L2 loss for our linear model.\"\"\"\n", - " predictions = wb(inputs)\n", - " return tf.reduce_mean(tf.square(predictions - labels))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 34, - "output_extras": [ - { - "item_id": 1 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 24, - "status": "ok", - "timestamp": 1505502830875, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "RkNbXoXkpjVH", - "outputId": "c36fc98d-3a57-4074-901d-c10ae017ae3f" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "\u003ctf.Tensor: id=40, shape=(), dtype=float32, numpy=7.3549819\u003e" - ] - }, - "execution_count": 6, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "# Test loss function (optional).\n", - "\n", - "loss_fn(inputs, labels, wb)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 51, - "output_extras": [ - { - "item_id": 1 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 57, - "status": "ok", - "timestamp": 1505502830981, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "K_7beXoHOU7t", - "outputId": "1ad0856a-02ec-4117-a6c0-b41030981d87" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "w: tf.Tensor([[ 1.56891453]], shape=(1, 1), dtype=float32)\n", - "b: tf.Tensor([ 0.], shape=(1,), dtype=float32)\n" - ] - } - ], - "source": [ - "# At this point, the variables exist, and can now be queried:\n", - "\n", - "w, b = wb.variables\n", - "print(\"w: \" + str(w.read_value()))\n", - "print(\"b: \" + str(b.read_value()))" + "def f(x, y):\n", + " output = 1\n", + " for i in range(y):\n", + " output = tf.multiply(output, x)\n", + " return output\n", + "\n", + "def g(x, y):\n", + " # Return the gradient of `f` with respect to it's first parameter\n", + " return tfe.gradients_function(f)(x, y)[0]\n", + "\n", + "assert f(3.0, 2).numpy() == 9.0 # f(x, 2) is essentially x * x\n", + "assert g(3.0, 2).numpy() == 6.0 # And its gradient will be 2 * x\n", + "assert f(4.0, 3).numpy() == 64.0 # f(x, 3) is essentially x * x * x\n", + "assert g(4.0, 3).numpy() == 48.0 # And its gradient will be 3 * x * x" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "YIlebeb_qYtC" + "id": "aNmR5-jhpX2t" }, "source": [ - 
"## Step 4: Create our gradients function using `implicit_value_and_gradients()`\n", - "\n", - "With a loss function defined, we can calculate gradients and apply them to our variables to update them.\n", - "\n", - "To calculate the gradients, we wrap our loss function using the `implicit_value_and_gradients()` function.\n", - "\n", - "`implicit_value_and_gradients()` returns a function that accepts the same inputs as the function passed in, and returns a tuple consisting of:\n", + "At times it may be inconvenient to encapsulate computation of interest into a function. For example, if you want the gradient of the output with respect to intermediate values computed in the function. In such cases, the slightly more verbose but explicit [tf.GradientTape](https://www.tensorflow.org/api_docs/python/tf/GradientTape) context is useful. All computation inside the context of a `tf.GradientTape` is \"recorded\".\n", "\n", - "1. the value returned by the function passed in (in this case, the loss calculated by `loss_fn()`), and\n", - "1. a list of tuples consisting of:\n", - " 1. The value of the gradient (a `tf.Tensor`) with respect to a given variable\n", - " 1. The corresponding variable (`tf.Variable`)\n", - "\n", - "Test it out below to get a feel for what it does. Notice how the first value of the returned tuple (the loss) is the same as the value returned in the cell above that tests our loss function." + "For example:" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "cellView": "code", "colab": { "autoexec": { "startup": false, @@ -403,94 +225,48 @@ } }, "colab_type": "code", - "id": "v1spZQ4NwW1U" + "id": "bAFeIE8EuVIq" }, "outputs": [], "source": [ - "# Produce our gradients function. See description above for details about\n", - "# the returned function's signature.\n", - "\n", - "value_and_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 153, - "output_extras": [ - { - "item_id": 1 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 46, - "status": "ok", - "timestamp": 1505502831114, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "21WMcpsmFFLd", - "outputId": "f51b3171-33f5-4f87-8bf7-0be2dc8edc8a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Outputs of value_and_gradients_fn:\n", - "Loss: tf.Tensor(7.35498, shape=(), dtype=float32)\n", - "\n", - "Gradient: tf.Tensor([[-3.00773573]], shape=(1, 1), dtype=float32)\n", - "Variable: \u003ctf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32\u003e\n", - "\n", - "Gradient: tf.Tensor([-4.06519032], shape=(1,), dtype=float32)\n", - "Variable: \u003ctf.Variable 'dense/bias:0' shape=(1,) dtype=float32\u003e\n" - ] - } - ], - "source": [ - "# Show outputs of value_and_gradients_fn.\n", - "\n", - "print(\"Outputs of value_and_gradients_fn:\")\n", - "\n", - "value, grads_and_vars = value_and_gradients_fn(inputs, labels, wb)\n", - "\n", - "print('Loss: {}'.format(value))\n", - "for (grad, var) in grads_and_vars:\n", - " print(\"\")\n", - " print('Gradient: {}\\nVariable: {}'.format(grad, var))" + "x = tf.ones((2, 2))\n", + " \n", + "# TODO(b/78880779): Remove the 'persistent=True' argument and use\n", + "# a single t.gradient() call when the bug is resolved.\n", + "with tf.GradientTape(persistent=True) as t:\n", + " # 
TODO(ashankar): Explain with \"watch\" argument better?\n", + " t.watch(x)\n", + " y = tf.reduce_sum(x)\n", + " z = tf.multiply(y, y)\n", + "\n", + "# Use the same tape to compute the derivative of z with respect to the\n", + "# intermediate value y.\n", + "dz_dy = t.gradient(z, y)\n", + "assert dz_dy.numpy() == 8.0\n", + "\n", + "# Derivative of z with respect to the original input tensor x\n", + "dz_dx = t.gradient(z, x)\n", + "for i in [0, 1]:\n", + " for j in [0, 1]:\n", + " assert dz_dx[i][j].numpy() == 8.0" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "JVDWpL9VYWdP" + "id": "DK05KXrAAld3" }, "source": [ - "## Step 5: Create an optimizer\n", + "### Higher-order gradients\n", "\n", - "We'll use a `GradientDescentOptimizer` to fit our model." + "Operations inside of the `GradientTape` context manager are recorded for automatic differentiation. If gradients are computed in that context, then the gradient computation is recorded as well. As a result, the exact same API works for higher-order gradients as well. For example:" ] }, { "cell_type": "code", "execution_count": 0, "metadata": { - "cellView": "code", "colab": { "autoexec": { "startup": false, @@ -498,362 +274,45 @@ } }, "colab_type": "code", - "id": "DudNEebMKDWN" + "id": "cPQgthZ7ugRJ" }, "outputs": [], "source": [ - "optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "YBeJYxY8YaiO" - }, - "source": [ - "### Step 5a: Test Our Optimizer\n", - "\n", - "Now we have everything needed to start fitting our variables to the data!\n", + "# TODO(ashankar): Should we use the persistent tape here instead? Follow up on Tom and Alex's discussion\n", "\n", - "In the next cell, we'll demo these capabilities. We'll:\n", + "x = tf.constant(1.0) # Convert the Python 1.0 to a Tensor object\n", "\n", - "1. Print the current values of `w` and `b`\n", - "1. Calculate the loss and gradients\n", - "1. Apply the gradients\n", - "1. Print out the new values of `w` and `b`\n", + "with tf.GradientTape() as t:\n", + " with tf.GradientTape() as t2:\n", + " t2.watch(x)\n", + " y = x * x * x\n", + " # Compute the gradient inside the 't' context manager\n", + " # which means the gradient computation is differentiable as well.\n", + " dy_dx = t2.gradient(y, x)\n", + "d2y_dx2 = t.gradient(dy_dx, x)\n", "\n", - "You can run the cell multiple times. Each time, you should see the values of `w` and `b` get closer to their true values of 3 and 2." 
- ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": { - "cellView": "code", - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 102, - "output_extras": [ - { - "item_id": 1 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 103, - "status": "ok", - "timestamp": 1505502831285, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "diDZfrMJM3OC", - "outputId": "d585fff0-ecb3-4e98-9b33-bbae07a95d8c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Values of w, b, BEFORE applying gradients:\n", - "(array([[ 1.56891453]], dtype=float32), array([ 0.], dtype=float32))\n", - "()\n", - "Values of w, b, AFTER applying gradients:\n", - "(array([[ 1.86968815]], dtype=float32), array([ 0.40651903], dtype=float32))\n" - ] - } - ], - "source": [ - "# Test the optimizer.\n", - "\n", - "print(\"Values of w, b, BEFORE applying gradients:\")\n", - "w, b = wb.variables\n", - "print(w.read_value().numpy(), b.read_value().numpy())\n", - "print()\n", - "\n", - "# Calculate the gradients:\n", - "empirical_loss, gradients_and_variables = value_and_gradients_fn(\n", - " inputs, labels, wb)\n", - "optimizer.apply_gradients(gradients_and_variables)\n", - "\n", - "print(\"Values of w, b, AFTER applying gradients:\")\n", - "print(w.read_value().numpy(), b.read_value().numpy())" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "61TgeLVlKEQp" - }, - "source": [ - "## Step 6: Create a training loop\n", - "\n", - "Of course, now we can simply turn all of this code into a self-standing training loop. We'll also capture our loss and approximations of `w` and `b` and plot them over time." 
- ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 397, - "output_extras": [ - { - "item_id": 1 - }, - { - "item_id": 2 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 225, - "status": "ok", - "timestamp": 1505502831550, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "VukGe-huNaJ4", - "outputId": "f0a8d665-1910-477c-d8ab-c94ccdc4afcd" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2.111051321029663, 2.3047544956207275, 2.4602210521698, 2.5850086212158203, 2.6851789951324463, 2.7655951976776123, 2.830157995223999, 2.8819968700408936, 2.9236228466033936, 2.9570505619049072]\n" - ] - }, - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAd0AAAFXCAYAAADnFpTQAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzs3Xd4FFUbBfAzu+m9koSShBQCSC+igIAgRRGkChJEiggo\nHURAEBQBQeADRcWCha50ULFLk6IivYRQQwskhPS6O/P9sckmm4Rkk2x2difn9zz7bLuZvC8JHO7M\n7FxBkiQJREREVOlUchdARERUVTB0iYiIzIShS0REZCYMXSIiIjNh6BIREZkJQ5eIiMhMjArdlJQU\njB8/Hk8//TS6d++OkydPVnZdREREiiMY8znd6dOno2XLlujbty80Gg0yMzPh4uJijvqIiIgUo9TQ\nTU1NRa9evfDbb7+ZqyYiIiJFKnX38s2bN+Hp6YkZM2agd+/emD17NjIzM81RGxERkaKUGroajQbn\nzp3DoEGDsH37djg4OOCzzz4zR21ERESKUmro+vv7w9/fHw0bNgQAdO3aFefOnSvxa3g5ZyIioqJs\nShvg4+ODgIAAXL16FbVr18aRI0cQGhpa4tcIgoC4uBSTFSkHX19Xq+8BUEYfSugBYB+WRAk9AMro\nQwk9ALo+jFFq6ALArFmzMHXqVGg0GtSqVQsLFy6sUHFERERVkVGhW7duXWzdurWyayEiIlI0XpGK\niIjITBi6REREZsLQJSIiMhOGLhERkZkwdImIiMyEoUtERCbRuXM7uUuweAxdIiIyCUEQ5C7B4hn1\nOV0iIqKy+OijFTh69BAEQYUhQ4ajU6fOuH8/HnPmzER6ehq0Wi2mTJmOJ59sgwUL3kZU1HkAArp3\n74nnn39B7vIrDUOXiEhh5s6dhd27d5h0mz169MLcue8aNXbv3t9x+XI01qz5Fg8eJODll4egadNm\n+PXXn9Cq1eN48cVhkCQJmZmZOH/+POLi7uGbbzYBANLSUk1at6Xh7mUiIjKp06dP4qmnugIAPD29\n0LRpc5w/fw716j2CH37Yha+++hyXLkXD0dERtWrVwp07t7F8+RIcPXoYTk7OMldfuTjTJSJSmLlz\n3zV6VloZCq80l/e8ceOm+Oijz3H48EEsWDAXAwcOxuDBA/D11xtx9Ohh7Ny5DX/88StmzHhLjrLN\ngjNdIiIyifxwbYbff/8VoijiwYMHOHXqBOrXfwSxsbHw8PDEs8/2wrPP9sLFixeQmJgIUdSiffsn\n8fLLoxEdHSVzF5WLM10iIjKJvLOX27d/EmfPnsbQoS9AEFR49dXx8PT0wp4932PjxrWwsbGBk5Mz\nZs16G7GxsXj99TcgSSIEQcDo0eNk7qJyCVIlrThv7esjKmmNR2vvQwk9AOzDkiihB0AZfSihB8D4\n9XS5e5mIiMhMGLpERERmwtAlIiIyE4YuERGRmTB0iYiIzIShS0REZCYMXSIismjHjx/DmTOn9M93\n7NiKn3/+0STbXrv2K5Nsx1gMXSIismjHjx/D6dP5odurV1907fqMSba9Zo15Q5dXpCIiogrbsGEN\n7O3t0bfvAHzwwVJcvnwJK1Z8gmPH/sGPP+7C7NnzDMZHRV3Ahx8ug0aTDWdnN7z55hx4eXlj8+ZN\n2LlzG2xsbBAcXBujR4/Fzp1boVbb4Ndf92DixNfx779/w8nJCQMHDsa4caNQp04ETp48gczMTMya\nNRdr136FK1cuo2PHzhg5cgwAYMaMqYiLu4fs7Cz07/8CevTohVWrViI7OwvDh0eidu0QzJ49D7/8\nsgebN2+CVqtB/foNMGXKdJOuE8zQJSJSGOe5s2Bv4qX9snr0QloJiyg0btwM3367Hn37DkBU1AXk\n5ORAq9Xi1KkTaNy4mcFYjUaD5csX4733liEsrBY2bdqGTz/9CDNmvIX167/Bli27YWNjg7S0VDg7\nu+C55/rqQxYA/v33b4Pt2dra4Ysv1mDz5k2YPn0KvvpqPVxcXDFgQC8MGBAJNzc3zJw5B66ursjK\nysLIkUPQvn1HjB49Ftu2bcaXX64HAFy/fg2///4LVq36Emq1GkuXLsIvv+wx2awaYOgSEZEJRETU\nRVTUeaSnp8PW1hYREXVx/vw5nDx5HJMmTTMYGxNzHVeuXMakSa9BrVYhO1sDHx9fAEBYWDjmzn0T\n7dp1wBNPdDDqe7dt2w4AEBoahpCQUHh6egEAqlevgXv37sLNzQ3ffbcBBw7sAwDcu3cPN2/GoH79\nBgYrIv3779+4eDEKI0cOgSRJyM7OhpeXV0X/aAwwdImIFCZt7rslzkorg42NDfz9A/Djj7vQsGFj\nhIWF4/jxf3H79i0EBQUXGi0hJCQUn3zyZZFrL7///gqcOPEfDh7cjzVrvsSaNd+W+r1tbe0A6BZc\nsLW11b8uCAK0Wi2OHz+G//77F5999jXs7OwwbtwoZGdnF7MlCd26dceoUa+V40/AODyRioiITKJx\n46bYuHEdmjRphkaNmmDHjq0ID69TZFxgYDAePEjEmTOnAeh2N1+9egUAcPduLJo2bY4xY8YhLS0N\nGRnpcHJyQlpaWrnrSktLhaurK+zs7HD9+jWcPXtG/56trS20Wi0AoHnzR7F37+948OABACA5ORmx\nsbHl/r7F4UyXiIhMonHjpli79is0aNAQ9vYOsLe3L3I8F9DNit
99dxGWL38fy5cvQnZ2Dp5//gXU\nqhWId96ZnRuwEvr3HwhnZxe0adMOs2a9gb/+2o+JE183OLGppJOc8t5r1ao1duzYisGDn0dgYBAa\nNGioH9OzZ2+89NJARETUxezZ8/Dyy2MwefJrEEUJtra2mDx5Gvz9/U32Z8Sl/R5CSctNWXsfSugB\nYB+WRAk9AMroQwk9AFzaj4iIyOIwdImIiMyEoUtERGQmDF0iIiIzYegSERGZCUOXiIjITBi6RERk\ndt99txFZWVlyl2F2DF0iIjK7zZs3Iisrs9j3RFE0czXmw9AlIqIK27BhDbZu1V0n+YMPlmLCBN2S\neseO/YN582YbjN2yZRPi4+MwbtxovPTSSwCAzp3bYeXK5Rg2bBDOnDmF/v17Ijk5CQBw4cJ5jBs3\nCgCQmZmJhQvfwciRL2H48ME4eHC/uVo0CV4GkohIgbyaNyj29YRjZ4p9vazjCyvL0n79+g3Et99u\nxIcfforQ0BqIi0tBZmYGGjRoiLFjJ+aOMry8Y94lHb/5ZjWaN38UM2a8hdTUVIwcOQQtWz4Ke3sH\no+qUG0OXiIgqrCxL++lIuTcdtVqN9u07Fnq/qH/+OYpDhw5g48Y1AHSLJdy9G4vAwGCT9VKZGLpE\nRApk7Ay1vOMLK9vSfkXZ2dkbLF6gVqshirrgzc7OP+FKkiS8++5i1KoVWKF65cJjukREZBLGLu0H\nAE5OzgbL9RVeeycgoDqios4DAPbt+0P/+qOPPoYtWzbpn0dHR5myhUpn1Ey3Y8eOcHFxgUqlgo2N\nDbZs2VLZdRERkZUxdmk/AOjZsxemTh2PgAB/LFmyssgSfUOHjsR7770DFxcXNG3avMDrL+ODD5bi\npZcGAgD8/QOwaNH/Kq8pEzNqab9OnTph27ZtcHd3N2qjFy9ehKdnQIWLk5OSlpuy9j6U0APAPiyJ\nEnoAlNGHEnoATLy0nyRJZfrc1IABA5CTk2P0eCIioqrAqNAVBAEjRoxA37598d1335U6/sSJE/jw\nQ+uZ7hMREZmDUcd0N23aBF9fXyQkJGDYsGEICQlBixYtHjq+Ro0aWLp0Ebp164769R8xWbFERETW\nzKhjugWtXLkSzs7OGDZs2EPH/PDDD3j22WfRvHlzHDlyBDY2/GQSERFRqWmYkZEBURTh7OyM9PR0\nHDx4EGPHji3xa7p3747nn38B3323EXPnvosJE6aYrGBzUdLBfWvvQwk9AOzDkiihB0AZfSihB8D4\nE6lKDd34+HiMHTsWgiBAq9WiR48eaNu2bakbfvfd97Bv3594//2F6NatOyIi6hpVEBERkVKVeiJV\nrVq1sHPnTuzYsQO7d+/GK6+8YtSGPTw88f77y5GdnY0JE8ZAo9FUuFgiIrJMsbF3MGTIAJNuMzr6\nIg4f/kv//ODB/Vi//huTbFuupQUr9YpU3bo9g759n8d//x3DqlUfVea3IiIimRW+wEVFXbp0EUeO\n5Idu27btEBn5kkm2XdLSgpWp0s9wmj9/Efbv34tFi95F165PP/SSYEREZN00Gg3eeWc2Ll68gNq1\nQzFr1tuwt7c3GHPr1k0sW7YYSUmJcHBwwHvvLYCLiw/++OM3fP3151Cr1XB2dsHy5R/jiy9WITs7\nG6dPn8TgwcOQlZWJCxfOYdKkaViw4G3Y2dkjOjoKiYkPMGPGW9iz53ucPXsa9es3wMyZcwAAS5a8\nh6ioc8jKykKHDp0wfPgrBksLenh4YMWKT/D330fw5ZefIScnBzVq1MTMmXPg4GD6lYsqPXS9vLyx\nePH/MGxYJCZMeBW7d/8MtVpd2d+WiKjKmjvXHrt3m/af9x49NJg7t+TdsTEx1zFjxhw0aNAQCxe+\ng+3bN2PgwMEGYxYvXoBp02aiRo2aOHfuDObOnYslS1bim2++wLJlH8HHxwdpaamwsbHByy+PRlTU\neUyc+DoAYM+e7w1m06mpKfj0069w8OA+vPHGJKxa9RVq1w7BiBEv4tKlaISFhWPUqNfg6uoKURQx\nYcIYXLlyyWBpQTc3NyQlJWLNmi+xYsXHsLd3wPr132DTpnUYOvRlk/4ZAmZaZah79x7o1asPduzY\nhs8//wSjR5d89jMREVkfPz9/NGjQEADQtesz2LLlW4PQzcjIwJkzJzF79hsFFjjQ3Tds2Bjz589B\nx46d0b79k0Z9vzZtngAAhISEwcvLG7VrhwAAatcOQWzsbYSFheP333/Grl07oNVqkZBwH1evXkVI\nSBgKLi149uwZXLt2BWPGjIAkSdBoNGjQoFHF/0CKYbYP0C5YsAQHD+7HggXvoEuXbrlNExGRqc2d\nm1XqrLQyFD6mW/gQrySJcHV1w5dfrte/lveRoalTZ+D8+bM4dOggRox4EatXryv1+9nZ2QEAVCqV\n/nHec61Wizt3bmPTpvVYvXotnJ1dsGDB2wbLBObXJaFly8cwZ867ZWm3XMy2tJ+Pjw/ee28pMjMz\nMWHCa2W6ljMREVm+2Ng7OHtWty7vr7/+jEaNmhi87+TkjICA6vjzz9/0r124cAGA7lhvvXqPYMSI\nUfDw8MS9e3fh5ORksPxfSYq7zlNaWhocHR3h5OSMhIT7OHLkkEEtedt+5JGGOH36JG7dugkAyMrK\nxI0bMWXo3HhmvVRUz5690aPHduzevQOrV3+KkSPHmPPbExFRJQoKCsa2bd9h4cK3ERwcgl69+hUZ\nM2fOu3j//YX45psvodVq0LNnD/Tv/yI+/ngFbt68AQBo3rwlwsLCUa2aH9at+xrDh0di8OCHXwUR\nKP7M6bCwcISHRyAysh+qVfNDo0aN9e/lLS3o4+OLFSs+wcyZczB37kxkZ+dAEASMHDkGtWoFVvBP\npJg6y3oZSGM97AojcXFxeOKJlsjMzMSffx7S74O3NEq6Soq196GEHgD2YUmU0AOgjD6U0ANg4qX9\nTMnX1xcLFy5Beno6Jk0ay93MRERUZZg9dAGgV6++ePrpZ3Ho0EF8/fVqOUogIiIyO1lCVxAELF78\nP3h4eOCdd97C9evX5CiDiIjIrGQJXQDw8/PD/PmLkZ6ehsmTxxV75hkREZGSyBa6ANCv3wB06dIN\nBw7sw5o1X8lZChERUaWTNXQFQcCSJSvg7u6BuXNnVdrnooiIiCyBrKELAP7+AZg3byHS0lK5m5mI\nyEoZu7Tfnj3f4/79eDNUZJlkD10AGDBgEDp16ox9+/7Ehg1r5S6HiIjKwZil/X78cTfi4uKKfa8q\nfITUIkJXEAQsXfoBXF3d8NZbM3H79i25SyIiojLKW9pv8OD+mD17epFF4vfu/R0XLpzHvHmzMXx4\nJLKystCxY0d88smHGDHiRfz5528YN24UoqJ0l4ZMSkpE//49AegC+eOPV2DkyJcwdOgg7Nq13ez9\nmYJFhC4AVK9eA++8swApKcmYMmU8dzMTEVVA8+bOxd5MNb44MTHX0afP81i3bjOcnJywfftmg/c7\ndOiEevXqY86cd/Hll
+v1a+26u3tg9eq16NSpSzFb1c2ev/9+J1xcXPH559/g88+/wa5d2xEbe6dM\n9VkCiwldABg06EV06NARv//+K779doPc5RARURkUXtrv1KmTRcZIkoTCc6pOnTqXuu2//z6Cn376\nAcOGDcIrr7yE5OQkqzz51qwLHpRGEAQsW/Yh2rV7DLNnz0CHDh3h7x8gd1lERFbn2DHjVucp7/ji\nlLa038M4OjrqH6vVakiS7thudnZ2gVESJk16HS1bPlbRMmVlUTNdAKhZsxbmzJmHpKRETJ06gbuZ\niYisRGlL+wGAs7Mz0tJSH7qNgIAauHDhHAAYLAH46KOPY9u2LdBoNACAGzdikJWVacryzcLiQhcA\nhgwZhieeaI9ffvkJW7Z8K3c5RERkhLyl/QYP7o+UlORil/Z7+ulnsWTJQv2JVIVnxy+8EInt27di\n+PDBSE5O1r/eo0cvBAfXxogRgzFkyAAsWbIQWq220nsyNbMv7WesmJjraNfuMdjZ2eLAgX/g5+dn\nosqMo6Tlpqy9DyX0ALAPS6KEHgBl9KGEHgALXtrPWIGBQZg9+20kJiZi2rRJ3M1MRERWz2JDFwCG\nDXsZrVu3xZ4932PHjq1yl0NERFQhFh26KpUK//vfSjg5OWHGjKm4d++e3CURERGVm0WHLgDUrh2C\nN9+cg4SEBMyYMVXucoiIiMrN4kMXAEaMGIVWrR7H7t07rPbSX0RERFYRuiqVCitWfAQHBwdMnz4F\n8fFVd4UKIiKyXlYRugAQEhKGGTPeQnx8PGbO5G5mIiKyPlYTugDwyitj0KLFo9ixYxt++GG33OUQ\nERGViVWFrlqtxooVH8Pe3h7Tpk1CQsJ9uUsiIiIymlWFLgCEh9fBG2/MQlzcPbz55htyl0NERGQ0\nqwtdABgzZiyaNWuOrVu/w08//Sh3OUREREaxytDV7Wb+BHZ2dnj99YlITHwgd0lERESlssrQBYCI\niLp4/fUZuHs3FrNnz5C7HCIiolJZbegCwGuvTUDjxk3x7bcb8OuvP8ldDhERUYmsOnRtbGzwwQef\nwNbWFlOnTkRSUqLcJRERET2UVYcuANSrVx+TJ0/DnTu3MWfOm3KXQ0RE9FBWH7oAMH78ZDRo0Agb\nNqzFH3/8Jnc5RERExVJE6Nra2uKDDz6BjY0NJk8eh5SUZLlLIiIiKkIRoQsADRo0xMSJU3H79i3M\nnTtb7nKIiIiKUEzoAsDEiVNRv34DrF37Ffbt+1PucoiIiAwYHbqiKKJ3794YPXp0ZdZTIXZ2dvjg\ng4+hVqsxefI4pKamyF0SERGRntGhu2bNGoSGhlZmLSbRqFETjB8/CTduxGDevDlyl0NERKRnVOjG\nxsZi37596N+/f2XXYxKTJ7+BunXr4auvvsDBg/vlLoeIiAiAkaG7YMECTJs2DYIgVHY9JmFvb48V\nKz6GSqXCxIljkZaWJndJREREsCltwN69e+Hj44N69erh6NGjRm/Y19e1QoVVVJcuHTBt2jS89957\nWLZsAT744IMyb0PuHkxFCX0ooQeAfVgSJfQAKKMPJfRgLEGSJKmkAcuWLcOuXbugVquRlZWFtLQ0\ndO7cGYsXLy5xw3Fx8p/ElJmZiaeeegIXL0Zh5849ePzxNkZ/ra+vq0X0UFFK6EMJPQDsw5IooQdA\nGX0ooQfA+P84lLp7efLkydi7dy9+//13LFu2DK1atSo1cC2Fg4MDli//CCqVChMmvIr09HS5SyIi\noipMUZ/TLU6LFo9i9OixuHbtKhYunCd3OUREVIWVKXQfffRRrFq1qrJqqTRvvPEmQkPD8NlnH+Po\n0SNyl0NERFWU4me6AODo6Ijlyz8GAEyc+CoyMjJkroiIiKqiKhG6ANCq1WN45ZUxuHz5EhYtmi93\nOUREVAVVmdAFgBkz3kJwcG2sWrUS//77t9zlEBFRFVOlQtfJyQkrVnwMURQxYcKryMzMlLskIiKq\nQqpU6ALA44+3wcsvj0J09EUsWfKe3OUQEVEVUuVCFwDefHMuAgODsXLlchw/fkzucoiIqIqokqHr\n7OyM5ctX6nczZ2VlyV0SERFVAVUydAGgbdt2GDp0BC5cOI///c86rrBFRETWrcqGLgC89dY7qFUr\nECtWLMOpUyfkLoeIiBSuSoeui4srli37EFqtFuPHv4rs7Gy5SyIiIgWr0qELAO3bP4kXXxyGc+fO\nYPnyJXKXQ0REClblQxcA5s6dhxo1amL58iU4c+a03OUQEZFCMXQBuLq6YenSD6DRaDB+/Bjk5OTI\nXRIRESkQQzdXx45PYdCgF3HmzCl8+OH/5C6HiIgUiKFbwNtvz4e/fwCWLl2E06e5m5mIiEyLoVuA\nu7sHli5dgZycHAwdOhSpqalyl0RERArC0C2kc+duiIwcgv/++w8DBvRGcnKS3CUREZFCMHSL8f77\nyzFo0CD8889R9O3bEwkJ9+UuiYiIFIChWwwbGxusWbMGgwa9iJMnj6N372cRFxcnd1lERGTlGLoP\noVarsWzZhxg+fCTOnz+LXr2exp07t+Uui4iIrBhDtwQqlQoLFy7Bq6+OR3T0RfTs2Q03bsTIXRYR\nEVkphm4pBEHAnDnzMGXKG7h+/Rp69uyGK1cuy10WERFZIYauEQRBwBtvvIlZs+bi1q2beO65pxEV\ndUHusoiIyMowdMtg/PjJmD9/Ee7ejUWvXk/j9OlTcpdERERWhKFbRiNHjsGSJSuQkJCAPn2exfHj\nx+QuiYiIrARDtxyGDBmGDz9chZSUZPTt2xNHjhyWuyQiIrICDN1yev75F/DZZ18hMzMDAwf2xoED\n++QuiYiILBxDtwJ69uyNr75aD41Gg0GD+uG3336WuyQiIrJgDN0K6tr1aaxb9x1UKhVeemkQfvhh\nt9wlERGRhWLomkCHDh2xceNW2NnZ4+WXh2Dbts1yl0RERBaIoWsirVu3xebNO+Ds7IIxY17Ghg1r\n5S6JiIgsDEPXhFq0eBTbtu2Gp6cnJk58DatXfyZ3SUREZEEYuibWqFETbN/+I3x9q2HGjKn4+OMP\n5S6JiIgsBEO3EtSrVx87d+5BQEB1zJ37JpYuXQRJkuQui4iIZMbQrSRhYeHYuXMPAgODsGjRfCxY\n8A6Dl4ioimPoVqLg4NrYuXMPQkJCsWLFUsyePZ3BS0RUhTF0K1mNGjWxc+dPqFu3Hj777BNMnToR\noijKXRYREcmAoWsGfn5+2L79RzRo0Ahr136FceNGQ6PRyF0WERGZGUPXTLy9vbFt2240b94Cmzdv\nwujRI5CTkyN3WUREZEYMXTPy8PDE5s078fjjbbBr13YMHz4YmZmZcpdFRERmwtA1MxcXV2zcuBXt\n2z+Jn3/egyFDBiI9PV3usoiIyAwYujJwcnLC2rXfokuXbti79w8MGtQPqakpcpdFRESVrNTQzc7O\nRv/+/dGrVy/06NEDK1euNEddiufg4IAvv1yHHj164dChg+jfvxeSkhLlLouIiC
qRTWkD7OzssGbN\nGjg6OkKr1eKFF15Au3bt0KhRI3PUp2h2dnb49NMvYW9vjy1bvkWfPj3w3Xc74O3tLXdpRERUCYza\nvezo6AhAN+vlR11My8bGBitXfooXXxyK06dPok+f7rh7967cZRERUSUodaYLAKIook+fPoiJiUFk\nZGTps9zgYHiJRa+8lHDsTLHDvZo3KPZ1WcerhCI9VGY9XwFwGDkan3++Cr16PY2tW3ejevUaFd9+\ngT6s6s+/oNweLKaeco5HzHWLqofjOd4SxisiL4CH/v0uzKjQValU2LFjB1JTU/Hqq6/i0qVLCAsL\nK/Fr1CqhyGu+vq4P+QZFx1rC+MI9VHY9n376Mby83LFo0SL07v0M/vjjDwQHB1d4+3l9yP3nWZHx\napVgUfWUZ/xDv8ZK6i843uBrLaCe8ozXP7eQeso7vrh/a+Wsp8zjoYy8MJYglfFiwCtXroSzszOG\nDRtW4ri4OOs+G9fX11WWHiRJwtKli7B48QJUr14D27btRkhIyf/BKYlcfZiSEnoA2IclUUIPgDL6\nsPgeRBHIzISQmQEh9x4ZmRCyMiFkZgKZGRAyMuE+dJBRmyt1ppuQkABbW1u4uroiMzMThw8fxiuv\nvFLhPqh4giBg6tTpcHBwxDvvzEbPnk9jy5ZdqFu3ntylERHJy8gAzHtfN7bg89z3szLzt5M7Hpm5\n28nIHZf3fna2cbWZKnTj4uIwffp0iKIIURTxzDPPoH379sYVQeU2duwEODo6YMaM19G79zP47rsd\naNiwsdxlEREVJUlAdjaE9DQI6em5tzT9PdLTIaQ95D1JA9fEFMMAzMrSPy9XAJa1fEEAHB0hOThA\ncnCE5OICyccXkoM9JAdHIO91BwdIjo6Avb3hcwcHuBj5vUoN3YiICGzfvr2CLVF5jBgxCg4Ojpg8\neRz69OmBTZu2onnzlnKXRUTWSJKAjIwioWd4nw4UfC2taEgWHZf7ulZb7tIcCpZZXAB6+0BydCga\ngA4ORQPRwQGSfe57jo4FxjoCDvaGz/O2aWsLCGU7NluYyUKX5BUZOQQODg4YO3YU+vV7Dhs2bMbj\nj7eRuywiqkyiqAuylBQIqakQUpINH6emQJWaCkg5cI5/UCQQ9Y/T8h8jIx2CCdbzllQqSE7OkJyc\nACcniN4+kJyc9K9JTk6QnAs8dnIGDN43fM+rpi/i00VdANo7AHZ2FQ5AS8bQtQJ9+z4POzt7jB49\nHAMH9sGaNZvQvv2TcpdFRAVJku64YEpKbiimFB+aqbrHKv17KbrX8h6npEBISzU6IJ2KK8XWVh9u\nors7pIDqucH38PArGJYlhSTs7U0bir6ukCz5RCoTY+haiR49noODw3oMH/4iBg9+HqtXr0GXLk/L\nXRaR9cvJyZ095oeeKi0lPwALhmZaam5gFgzRlPyvL+fFgyQ7O0iurpBcXCEGBUN0dc197gLJxS3/\nsasrJFfc0+i4AAAgAElEQVQ3iC4ukFxc4FHTDwlZAJxzw9HRUReMtram/TMik2HoWpHOnbth/frN\nGDJkIIYOjcSnn36JHj16yV0WkbxEEUJyEoTERKiSEiEkJkJISoQqMbH415ISgdRkeCcl6YKynMtr\nSmo1JBddOIoB1SE560JRdHXLD0gXV/2Y/OB0g+iS/1hycdHNHsvD1xXaKjRLVAKGrpVp164DNm3a\nhkGD+mPkyKH48MNV6N9/oNxlEVWMkcGpSnxQJECF5KQyHauUnJwAd3eInl6QagUWmUnqQzMvLAuG\npqsrRGfdPRwdFX3skSoHQ9cKPfZYa2zZshMDBvTB2LGjkJWVhcGDX5K7LKrqSgzOB/qQzAvSigan\n6O4BsXp1iPXqQ/LwgOTuAbHQveThAdHDE5KHJ0R3D0ju7oC9PXx9XfGAM0SSAUPXSjVr1gLbtn2P\n559/DpMnj0NmZgZefnm03GWRUmRkQBUfB9X9eKjux0OIj4fq/n2o7scDmalwi40zTXB6eEKsXgNi\n/UfyQ1Iflh6FXvPUvwc7u0psnqjyMHStWMOGjbBjxx707dsDM2dOQ0ZGJsaNmyh3WWSJ0tL0AaoP\n0fgCz+/H54bsfaji43UXLShB3hFIyckZoocHg5PISAxdKxcRURe7du1B3749MW/eW8jISMfrr8+A\nwGNNyiVJhiEaHwchNyzzn+cFqm52KqSnl75Ze3uI3j7QhIVD8vaG6O2ju/n4QPLxzX3uDc/QWojX\n2up21TI4icqEoasAISFh2LlTN+NdsuQ9ZGZmYvbstxm81kKSdB9FiYszDMr4eMNdvPfv658bc8at\n5OCgC9HwiEIh6gvJx0cfonnPJWcX404MqmKfqyQyJYauQgQGBmHXrp/Qt28PrFy5HBkZ6Zg/f7Hc\nZVVdkqQ7eSg2Fqo7t6G6GwukJcL5+q1Cx0lzH2dllb5JR0eIPr7Q1K2nuwpQboDqZ6O5AZoXrnB2\n5tm1RBaGoasgAQHVsWPHHvTv/xxWr/4MWVlZ+Prr1XKXpTzp6VDF3oH6bm6g6oP1DtR37kAVeweq\nu7HFzkYLXj1IcnKG6OMDTf1HdCFaIDAfGqJEZNUYugpTrVo1bN/+PQYM6IN1677BvXt3MG/eYtSu\nHSJ3aZZPo4Hq3l1daBYIT/Wd27rHsXd0AZuU+NBNSCoVRN9qutmof4D+pg2oDrewIDywdc4PUafi\nLuBHRErG0FUgLy9vbN26C6+8Mgy//PIL9u/fjwkTpmDs2ImwL++Vb6yZJEF4kKAL0oKz0dhYqGIL\nzFTj7pX4kRfRwwNiQAA0TZvlBmkARL8AiAHVIfr76+59fAGbh/y18nWFhsdCiao0hq5Cubm5Y+PG\nrfjzzz2YMGEiFi2ajy1bvsWiRcvQrl0HucsznbQ0qO8WmJnmBqsqNm+GGgvV3TslHjOVHBwg+gcg\np9XjEIsJUq2fP0T/AN0ViIiIKoChq2CCIGDAgAFo0aINFi2aj9WrP0O/fj3Rp08/vP32Qvj5+cld\n4sPlnoikvhEDJMXB4eKV/BlqgWBVJSc9fBMqFUQ/f90xU78AXaDm7uoV/fz1wSq5e/CEIyIyC4Zu\nFeDm5o758xdjwIBBmDZtErZt24Jff/0FM2fOxtChL0OtVstTWFoa1DdioI65BlXMdaivX4c6RndT\nxVyHKiVZP9S10JeKnp4Qa9SEpnkLaP0Dip2hij6+gFy9EREVg6FbhTRq1AQ//PAb1q79GvPnv40Z\nM17Hpk0b8P77/0OTJs1M/w2zs6G6dVMfpLowvaZ7fP06VPFxxX6Z5OQEbWAQcgJbQxsYBKd6dZDs\n6gWtf26g+gcADg6mr5eIqJIxdKsYtVqNoUNH4JlneuDtt2dh8+ZN6Nr1SQwdOgIzZ74Fd3cP4zcm\nirqPzsRch+r6NYNZqjrmOlR3bkMQxSJfJtnaQluzFjSPNIA2MAjawCCIuffawGBIPj4Gu3udfF2R\nxROQiEgBGLpVVLVq1fDRR59h0KAXM
IiIyE4YuERGRmRgVuvv370e3bt3QtWtXfPbZZ5VdExERkSKVGrqiKGLevHlYvXo1vv/+\ne/zwww+4fPmyOWojIiJSlFJD99SpUwgKCkKNGjVga2uL7t274/fffzdHbURERIpSaujevXsXAQEB\n+ud+fn64d+9epRZFRESkRKWGriRJ5qiDiIhI8WxKG+Dv74/bt2/rn9+9exfVqlUrdcO+vq4Vq8wC\nKKEHQBl9KKEHgH1YEiX0ACijDyX0YKxSZ7oNGzZETEwMbt26hezsbPzwww/o1KmTOWojIiJSlFJn\numq1GrNnz8bw4cMhSRL69euH0NBQc9RGRESkKILEg7ZERERmwStSERERmQlDl4iIyEwYukRERGZS\n6olUZbF//34sWLAAkiShb9++eOWVV0y5ebOYOXMm9u7dC29vb+zevVvucsolNjYW06ZNQ3x8PNRq\nNfr3748hQ4bIXVaZZWdnIzIyEjk5OdBqtejatSvGjh0rd1nlIooi+vbtCz8/P6xatUrucsqlY8eO\ncHFxgUqlgo2NDbZs2SJ3SeWSkpKCN998E9HR0VCpVFiwYAEaN24sd1lGu3r1KiZNmgRBECBJEm7c\nuIEJEyZY5d/xr7/+Glu2bIEgCKhTpw4W/r+9u3mJag8DOP6dHKRQexElCyzIjCySFr1AEyamSTXV\nxGCLNiVRbdIow14oghYJLfoHWkREEBEaRG1EszGmQiuGYIgwIhhMKkRT5yXPnOcu4l64G+89x7nz\na7rPZz1n+A6HmYcznHmmo4P8/HzTWY7cunXrr/fCv/qslQxJp9NSX18vsVhMfvz4IXv37pWhoaFM\nPX3WDAwMSDQaFb/fbzrFtS9fvkg0GhURkcnJSdmxY0dOngsRkXg8LiIilmVJU1OTRCIRw0Xu3Lx5\nU9ra2uT48eOmU1yrq6uTsbEx0xmzdvbsWbl//76IiExPT8vExIThIvfS6bT4fD4ZHh42neLYyMiI\n1NXVSSqVEhGRkydPSldXl+EqZ96/fy9+v19SqZRYliWHDx+WT58+zXhMxr5e/l12NG/YsIH58+eb\nzpiV0tJSqqqqACgoKKCioiJnV3fOmzcP+HnVa1mW4Rp3RkZGePr0KU1NTaZTZkVEsG3bdMasTE5O\nMjg4SDAYBMDr9VJYWGi4yr1wOMyyZcv+tqo3l9i2TSKRwLIsksnkv1q89Cv58OED69evJz8/n7y8\nPDZu3Eh3d/eMx2Rs6OqO5l9TLBbj3bt3VFdXm05xxbZtAoEAPp8Pn8+Xk6/j6tWrtLe34/F4TKfM\nisfj4ciRIwSDQe7du2c6x5VYLMaiRYs4f/48+/fv59KlSySTSdNZrj1+/Jjdu3ebznBl8eLFNDc3\nU1tbS01NDUVFRWzZssV0liOVlZUMDAwwPj5OIpEgFArx+fPnGY/J2NAV/bnvL2dqaorW1lYuXLhA\nQUGB6RxX5syZw4MHDwiFQkQiEYaGhkwnOdLX10dJSQlVVVU5/x65e/cunZ2d3Lhxgzt37jA4OGg6\nyTHLsohGoxw8eJCuri7mzp2bs/8RPj09TW9vLzt37jSd4sr379/p6enhyZMn9Pf3E4/Hc+4+moqK\nCo4ePUpzczPHjh1j9erVeL0z3yqVsaHrdkez+m9YlkVrayv79u2jvr7edM6sFRYWsmnTJvr7+02n\nOPL69Wt6e3vZvn07bW1tvHz5kvb2dtNZrpSWlgJQXFxMQ0MDb9++NVzkXFlZGWVlZaxbtw6AxsZG\notGo4Sp3QqEQa9eupbi42HSKK+FwmPLychYuXEheXh4NDQ28efPGdJZjwWCQzs5Obt++zYIFC1i+\nfPmMj8/Y0P2ddjTn+hUJ/LwLe+XKlRw6dMh0imujo6NMTEwAkEwmef78OStWrDBc5czp06fp6+uj\np6eH69evs3nzZq5du2Y6y7FEIsHU1BQA8XicZ8+eUVlZabjKuZKSEpYsWcLHjx8BePHiRc6utX30\n6BF+v990hmtLly4lEomQSqUQkZw9F6OjowAMDw/T3d39j+ckYz8Z+l12NP95NTI2NkZtbS0tLS1/\n3XSRK169esXDhw9ZtWoVgUAAj8fDqVOnqKmpMZ3myNevXzl37hy2bWPbNrt27WLbtm2ms/6Xvn37\nxokTJ/B4PKTTafbs2cPWrVtNZ7ly8eJFzpw5g2VZlJeX09HRYTrJsWQySTgc5sqVK6ZTXKuurqax\nsZFAIIDX62XNmjUcOHDAdJZjLS0tjI+P4/V6uXz5MkVFM/9jku5eVkoppbJEN1IppZRSWaJDVyml\nlMoSHbpKKaVUlujQVUoppbJEh65SSimVJTp0lVJKqSzRoauUUkpliQ5dpZRSKkv+AO2e4yf8wTuC\nAAAAAElFTkSuQmCC\n", - "text/plain": [ - "\u003cmatplotlib.figure.Figure at 0xc1dc310\u003e" - ] - }, - "metadata": { - "tags": [] - }, - "output_type": "display_data" - } - ], - "source": [ - "# Train our variables.\n", - "\n", - "# numpy is used for its asscalar() function.\n", - "import numpy as np\n", - "\n", - "num_training_steps = 10\n", - "\n", - "def train_model(inputs, labels, wb, optimizer, num_training_steps):\n", - " loss_at_step = []\n", - " w_at_step = []\n", - " b_at_step = []\n", - " for step_num in range(num_training_steps):\n", - " loss, gradients_and_variables = value_and_gradients_fn(inputs, labels, wb)\n", - " loss_at_step.append(np.asscalar(loss.numpy()))\n", - " \n", - " optimizer.apply_gradients(gradients_and_variables)\n", - " w, b = wb.variables\n", - " w_at_step.append(np.asscalar(w.read_value().numpy()))\n", - " b_at_step.append(np.asscalar(b.read_value().numpy()))\n", - "\n", - " print(w_at_step)\n", - " t = range(0, num_training_steps)\n", - " plt.plot(t, loss_at_step, 'k',\n", - " t, w_at_step, 'r',\n", - " t, [true_w] * num_training_steps, 'r--',\n", - " t, b_at_step, 'b',\n", - " t, [true_b] * 
num_training_steps, 'b--')\n", - " plt.legend(['loss', 'w estimate', 'w true', 'b estimate', 'b true'])\n", - " plt.show()\n", - "\n", - "train_model(inputs, labels, wb, optimizer, num_training_steps)" + "assert dy_dx.numpy() == 3.0\n", + "assert d2y_dx2.numpy() == 6.0" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", - "id": "UNurY9VJ-hpH" - }, - "source": [ - "## Other Ways to Compute Gradients\n", - "\n", - "Using our loss function as an example (`loss_fn()`), there are several other ways we could compute gradients:\n", - "\n", - "1. `tfe.implicit_gradients()`\n", - "1. `tfe.gradients_function()`\n", - "1. `tfe.implicit_value_and_gradients()`\n", - "1. `tfe.value_and_gradients_function()`\n", - "\n", - "Each of these functions does the following:\n", - "* Wraps a function.\n", - "* Returns a function with the same input signature as the wrapped function.\n", - "\n", - "They differ only in what information they return.\n", - "\n", - "### Gradients-only functions\n", - "\n", - "The following two functions return a function that returns only the variables' gradients:\n", - "\n", - "1. `tfe.gradients_function()`: Returns the partial derivatives of the function `f()` with respect to the parameters of `f()`.\n", - "1. `tfe.implicit_gradients()`: Returns the partial derivatives of the function `f()` with respect to the trainable parameters (`tf.Variable`) used by `f()`.\n", - "\n", - "In our example above, the `tf.layers.Dense` object encapsulates the trainable parameters.\n", - "\n", - "### Value and gradients functions\n", - "\n", - "The following two functions are identical to their counterparts above, except that they also return the value of the wrapped function.\n", - "\n", - "1. `tfe.implicit_value_and_gradients()`\n", - "1. `tfe.value_and_gradients_function()`\n", - "\n", - "### Gradient demos\n", - "\n", - "In the demos below, we show examples for the `implicit_*` functions, since our existing loss function works seamlessly with these versions. 
(The other versions require that your parameters are tensors and tensors only; in our example, we're using a `Dense` layer.)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 85, - "output_extras": [ - { - "item_id": 1 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 100, - "status": "ok", - "timestamp": 1505502831671, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "aEoCftnfAIH5", - "outputId": "72f1c1dc-a574-463f-f860-c4e5f48fcdaa" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[(\u003ctf.Tensor: id=673, shape=(1, 1), dtype=float32, numpy=array([[-0.26846504]], dtype=float32)\u003e,\n", - " \u003ctf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32\u003e),\n", - " (\u003ctf.Tensor: id=671, shape=(1,), dtype=float32, numpy=array([-0.32890949], dtype=float32)\u003e,\n", - " \u003ctf.Variable 'dense/bias:0' shape=(1,) dtype=float32\u003e)]" - ] - }, - "execution_count": 13, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "# tfe.implicit_gradients() demo\n", - "gradients_fn = tfe.implicit_gradients(loss_fn)\n", - "\n", - "# Returns only gradients and variables:\n", - "gradients_fn(inputs, labels, wb)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": { - "colab": { - "autoexec": { - "startup": false, - "wait_interval": 0 - }, - "height": 102, - "output_extras": [ - { - "item_id": 1 - } - ] - }, - "colab_type": "code", - "executionInfo": { - "elapsed": 88, - "status": "ok", - "timestamp": 1505502831785, - "user": { - "displayName": "", - "photoUrl": "", - "userId": "" - }, - "user_tz": 240 - }, - "id": "bbgCUdCzAVhH", - "outputId": "152aa9b6-9e42-4b7e-848a-9423c0b1929c" + "id": "4U1KKzUpNl58" }, - "outputs": [ - { - "data": { - "text/plain": [ - "(\u003ctf.Tensor: id=688, shape=(), dtype=float32, numpy=1.0623235\u003e,\n", - " [(\u003ctf.Tensor: id=720, shape=(1, 1), dtype=float32, numpy=array([[-0.26846504]], dtype=float32)\u003e,\n", - " \u003ctf.Variable 'dense/kernel:0' shape=(1, 1) dtype=float32\u003e),\n", - " (\u003ctf.Tensor: id=718, shape=(1,), dtype=float32, numpy=array([-0.32890949], dtype=float32)\u003e,\n", - " \u003ctf.Variable 'dense/bias:0' shape=(1,) dtype=float32\u003e)])" - ] - }, - "execution_count": 14, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], "source": [ - "# tfe.implicit_value_and_gradients() demo\n", - "value_gradients_fn = tfe.implicit_value_and_gradients(loss_fn)\n", + "## Next Steps\n", "\n", - "# Returns the value returned by the function passed in, gradients, and variables:\n", - "value_gradients_fn(inputs, labels, wb)" + "In this tutorial we covered gradient computation in TensorFlow. With that we have enough of the primitives required to build an train neural networks, which we will cover in the [next tutorial](https://github.com/tensorflow/models/tree/master/official/contrib/eager/python/examples/notebooks/3_neural_networks.ipynb)." 
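The two assertions added above (`dy_dx.numpy() == 3.0` and `d2y_dx2.numpy() == 6.0`) refer to a tape-based example whose setup sits earlier in the notebook and is not visible in this hunk. For orientation, here is a minimal sketch of the nested `tf.GradientTape` pattern that would produce exactly those values; the choice of `y = x**3` evaluated at `x = 1.0` is an assumption made for illustration, not something taken from the diff.

```python
import tensorflow as tf

tf.enable_eager_execution()

x = tf.constant(1.0)

with tf.GradientTape() as outer_tape:
    outer_tape.watch(x)
    with tf.GradientTape() as inner_tape:
        inner_tape.watch(x)
        y = x * x * x                      # assumed example: y = x^3
    dy_dx = inner_tape.gradient(y, x)      # 3 * x^2 == 3.0 at x = 1.0
d2y_dx2 = outer_tape.gradient(dy_dx, x)    # 6 * x   == 6.0 at x = 1.0

assert dy_dx.numpy() == 3.0
assert d2y_dx2.numpy() == 6.0
```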
] } ], "metadata": { "colab": { + "collapsed_sections": [], "default_view": {}, - "last_runtime": { - "build_target": "", - "kind": "local" - }, - "name": "Eager Execution Tutorial: Working with Gradients", + "name": "Automatic Differentiation", "provenance": [], "version": "0.3.2", "views": {} diff --git a/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb index 0088da5c4b583d..bfcc7feb075c40 100644 --- a/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb +++ b/tensorflow/contrib/eager/python/examples/notebooks/3_datasets.ipynb @@ -16,7 +16,9 @@ "\n", "We recommend using the `Dataset`s API for building performant, complex input pipelines from simple, re-usable pieces that will feed your model's training or evaluation loops.\n", "\n", - "If you're familiar with TensorFlow graphs, the API for constructing the `Dataset` object remains exactly the same when eager execution is enabled, but the process of iterating over elements of the dataset is slightly different. You will use a Pythonic `Iterator()` class instead of using `make_one_shot_iterator()` and `get_next()`. As a result, the discussion on iterators in the [Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets) is not relevant when eager execution is enabled." + "If you're familiar with TensorFlow graphs, the API for constructing the `Dataset` object remains exactly the same when eager execution is enabled, but the process of iterating over elements of the dataset is slightly simpler.\n", + "You can use Python iteration over the `tf.data.Dataset` object and do not need to explicitly create an `tf.data.Iterator` object.\n", + "As a result, the discussion on iterators in the [Programmer's Guide](https://www.tensorflow.org/programmers_guide/datasets) is not relevant when eager execution is enabled." ] }, { @@ -48,11 +50,8 @@ "# Import TensorFlow.\n", "import tensorflow as tf\n", "\n", - "# Import TensorFlow eager execution support (subject to future changes).\n", - "import tensorflow.contrib.eager as tfe\n", - "\n", "# Enable eager execution\n", - "tfe.enable_eager_execution()" + "tf.enable_eager_execution()" ] }, { @@ -137,32 +136,27 @@ "source": [ "# Step 3: Iterate\n", "\n", - "Use `tfe.Iterator` on the `Dataset` object to get a Python iterator over the contents of the dataset.\n", - "\n", - "If you're familiar with the use of `Dataset`s in TensorFlow graphs, note that this process of iteration is different. Here there are no calls to `Dataset.make_one_shot_iterator()` and no `get_next()` calls." + "When eager execution is enabled `Dataset` objects support iteration.\n", + "If you're familiar with the use of `Dataset`s in TensorFlow graphs, note that there is no need for calls to `Dataset.make_one_shot_iterator()` or `get_next()` calls." 
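The sentence above about Python iteration is the heart of this change: with eager execution enabled, a `tf.data.Dataset` can be consumed with a plain `for` loop, and no `tf.data.Iterator` has to be created. A small self-contained sketch of that pattern follows; the dataset contents here are illustrative and not taken from the notebook.

```python
import tensorflow as tf

tf.enable_eager_execution()

# Illustrative pipeline; the notebook builds its own ds_tensors and ds_file datasets.
ds = tf.data.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6])
ds = ds.map(tf.square).batch(2)

# The Dataset object itself is iterable when eager execution is on.
for batch in ds:
    print(batch)          # each batch is an EagerTensor of shape (2,)
    print(batch.numpy())  # ...which can be converted to a NumPy array
```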
] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 0, "metadata": { "colab": { "autoexec": { "startup": false, "wait_interval": 0 }, - "height": 153, - "output_extras": [ - { - "item_id": 1 - } - ] + "base_uri": "https://localhost:8080/", + "height": 153 }, "colab_type": "code", "executionInfo": { - "elapsed": 201, + "elapsed": 388, "status": "ok", - "timestamp": 1505952405928, + "timestamp": 1525154629129, "user": { "displayName": "", "photoUrl": "", @@ -171,7 +165,7 @@ "user_tz": 420 }, "id": "lCUWzso6mbqR", - "outputId": "ec027d30-96c6-4ea4-9ee1-ef74ec1ae29a" + "outputId": "8e4b0298-d27d-4ac7-e26a-ef94af0594ec" }, "outputs": [ { @@ -179,9 +173,9 @@ "output_type": "stream", "text": [ "Elements of ds_tensors:\n", - "tf.Tensor([4 9], shape=(2,), dtype=int32)\n", + "tf.Tensor([1 9], shape=(2,), dtype=int32)\n", "tf.Tensor([16 25], shape=(2,), dtype=int32)\n", - "tf.Tensor([36 1], shape=(2,), dtype=int32)\n", + "tf.Tensor([ 4 36], shape=(2,), dtype=int32)\n", "\n", "Elements in ds_file:\n", "tf.Tensor(['Line 1' 'Line 2'], shape=(2,), dtype=string)\n", @@ -191,22 +185,19 @@ ], "source": [ "print('Elements of ds_tensors:')\n", - "for x in tfe.Iterator(ds_tensors):\n", + "for x in ds_tensors:\n", " print(x)\n", "\n", "print('\\nElements in ds_file:')\n", - "for x in tfe.Iterator(ds_file):\n", + "for x in ds_file:\n", " print(x)" ] } ], "metadata": { "colab": { + "collapsed_sections": [], "default_view": {}, - "last_runtime": { - "build_target": "", - "kind": "local" - }, "name": "Eager Execution Tutorial: Importing Data", "provenance": [], "version": "0.3.2", diff --git a/tensorflow/contrib/eager/python/examples/notebooks/3_training_models.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/3_training_models.ipynb new file mode 100644 index 00000000000000..84f1d031d40604 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/notebooks/3_training_models.ipynb @@ -0,0 +1,485 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "k2o3TTG4TFpt" + }, + "source": [ + "# Training Models\n", + "\n", + "In the previous tutorial we covered the TensorFlow APIs for automatic differentiation, a basic building block for machine learning.\n", + "In this tutorial we will use the TensorFlow primitives introduced in the prior tutorials to do some simple machine learning.\n", + "\n", + "TensorFlow also includes a higher-level neural networks API (`tf.keras`) which provides useful abstractions to reduce boilerplate. We strongly recommend those higher level APIs for people working with neural networks. However, in this short tutorial we cover neural network training from first principles to establish a strong foundation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "3LXMVuV0VhDr" + }, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "PJ64L90aVir3" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "tf.enable_eager_execution()\n", + "tfe = tf.contrib.eager # Shorthand for some symbols" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "eMAWbDJFVmMk" + }, + "source": [ + "## Variables\n", + "\n", + "Tensors in TensorFlow are immutable stateless objects. 
Machine learning models, however, need to have changing state: as your model trains, the same code to compute predictions should behave differently over time (hopefully with a lower loss!). To represent this state which needs to change over the course of your computation, you can choose to rely on the fact that Python is a stateful programming language:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "VkJwtLS_Jbn8" + }, + "outputs": [], + "source": [ + "# Using python state\n", + "x = tf.zeros([10, 10])\n", + "x += 2 # This is equivalent to x = x + 2, which does not mutate the original\n", + " # value of x\n", + "print(x)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wfneTXy7JcUz" + }, + "source": [ + "TensorFlow, however, has stateful operations built in, and these are often more pleasant to use than low-level Python representations of your state. To represent weights in a model, for example, it's often convenient and efficient to use TensorFlow variables.\n", + "\n", + "A Variable is an object which stores a value and, when used in a TensorFlow computation, will implicitly read from this stored value. There are operations (`tf.assign_sub`, `tf.scatter_update`, etc) which manipulate the value stored in a TensorFlow variable." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "itxmrMil6DQi" + }, + "outputs": [], + "source": [ + "v = tfe.Variable(1.0)\n", + "assert v.numpy() == 1.0\n", + "\n", + "# Re-assign the value\n", + "v.assign(3.0)\n", + "assert v.numpy() == 3.0\n", + "\n", + "# Use `v` in a TensorFlow operation like tf.square() and reassign\n", + "v.assign(tf.square(v))\n", + "assert v.numpy() == 9.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-paSaeq1JzwC" + }, + "source": [ + "Computations using Variables are automatically traced when computing gradients. For Variables representing embeddings TensorFlow will do sparse updates by default, which are more computation and memory efficient.\n", + "\n", + "Using Variables is also a way to quickly let a reader of your code know that this piece of state is mutable." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "BMiFcDzE7Qu3" + }, + "source": [ + "## Example: Fitting a linear model\n", + "\n", + "Let's now put the few concepts we have so far ---`Tensor`, `GradientTape`, `Variable` --- to build and train a simple model. This typically involves a few steps:\n", + "\n", + "1. Define the model.\n", + "2. Define a loss function.\n", + "3. Obtain training data.\n", + "4. Run through the training data and use an \"optimizer\" to adjust the variables to fit the data.\n", + "\n", + "In this tutorial, we'll walk through a trivial example of a simple linear model: `f(x) = x * W + b`, which has two variables - `W` and `b`. Furthermore, we'll synthesize data such that a well trained model would have `W = 3.0` and `b = 2.0`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "gFzH64Jn9PIm" + }, + "source": [ + "### Define the model\n", + "\n", + "Let's define a simple class to encapsulate the variables and the computation." 
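Before the model class that follows, one detail from the Variables discussion above is worth making concrete: because computations that use a `Variable` are traced automatically, a `tf.GradientTape` needs no explicit `watch()` call for it. That is what lets the `train()` function defined later in this notebook ask the tape for `dW` and `db` directly. A small sketch, separate from the notebook's own cells:

```python
import tensorflow as tf

tf.enable_eager_execution()
tfe = tf.contrib.eager

v = tfe.Variable(3.0)

with tf.GradientTape() as tape:
    # No tape.watch(v) is needed: trainable variables are watched automatically.
    y = v * v

dy_dv = tape.gradient(y, v)
assert dy_dv.numpy() == 6.0  # d(v^2)/dv = 2v = 6.0 at v = 3.0
```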
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "_WRu7Pze7wk8" + }, + "outputs": [], + "source": [ + "class Model(object):\n", + " def __init__(self):\n", + " # Initialize variable to (5.0, 0.0)\n", + " # In practice, these should be initialized to random values.\n", + " self.W = tfe.Variable(5.0)\n", + " self.b = tfe.Variable(0.0)\n", + " \n", + " def __call__(self, x):\n", + " return self.W * x + self.b\n", + " \n", + "model = Model()\n", + "\n", + "assert model(3.0).numpy() == 15.0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "xa6j_yXa-j79" + }, + "source": [ + "### Define a loss function\n", + "\n", + "A loss function measures how well the output of a model for a given input matches the desired output. Let's use the standard L2 loss." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "Y0ysUFGY924U" + }, + "outputs": [], + "source": [ + "def loss(predicted_y, desired_y):\n", + " return tf.reduce_mean(tf.square(predicted_y - desired_y))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "qutT_fkl_CBc" + }, + "source": [ + "### Obtain training data\n", + "\n", + "Let's synthesize the training data with some noise." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "gxPTb-kt_N5m" + }, + "outputs": [], + "source": [ + "TRUE_W = 3.0\n", + "TRUE_b = 2.0\n", + "NUM_EXAMPLES = 1000\n", + "\n", + "inputs = tf.random_normal(shape=[NUM_EXAMPLES])\n", + "noise = tf.random_normal(shape=[NUM_EXAMPLES])\n", + "outputs = inputs * TRUE_W + TRUE_b + noise" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "-50nq-wPBsAW" + }, + "source": [ + "Before we train the model let's visualize where the model stands right now. We'll plot the model's predictions in red and the training data in blue." 
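As a rough sanity check on the number the next cell prints: the model starts at `W = 5.0, b = 0.0`, while the data were generated as `outputs = inputs * 3.0 + 2.0 + noise` with unit-normal `inputs` and `noise`. The expected initial L2 loss is therefore `E[((5x) - (3x + 2 + n))^2] = E[(2x - 2 - n)^2] = 4*E[x^2] + 4 + E[n^2] = 4 + 4 + 1 = 9`, since the cross terms vanish for independent, zero-mean `x` and `n`. The printed starting loss of roughly 9.5 is in that expected range; the exact value varies with the random draw.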
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "height": 293 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 1210, + "status": "ok", + "timestamp": 1527005898290, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "_eb83LtrB4nt", + "outputId": "3873f508-72fb-41e7-a7f5-3f513deefe38" + }, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEDCAYAAAA2k7/eAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJztnXlgU1X2xz/pAhRautCWUsCwWVlcUHHGBUFQcSg7uM8P\nFLUICo4VpygObihI3UdmUHBB0IGZQbEgFNGCqKgMolV2pKylCy1pukDp+n5/3LxmaUsDTUjSns8/\nbZKXd09C+b7zvvfccw2apmkIgiAITR4/TwcgCIIgnB9E8AVBEJoJIviCIAjNBBF8QRCEZoIIviAI\nQjNBBF8QBKGZENDYE+Tk5JCUlER+fj7+/v7cdtttTJgwgcLCQhITEzl27BidOnXijTfeICQkxBUx\nC4IgCOeAobF1+Hl5eeTn59OrVy9OnjzJ2LFj+ec//8mnn35KWFgYCQkJLFy4kKKiIh5//HFXxS0I\ngiCcJY22dKKioujVqxcAbdq0oXv37uTm5pKWlsaYMWMAGDNmDF999VVjhxIEQRAagUs9/MzMTPbs\n2cNll13GiRMniIyMBNRFoaCgwJVDCYIgCGeJywT/5MmTPPLII8ycOZM2bdpgMBhcdWpBEATBBbhE\n8CsrK3nkkUcYNWoUN910EwDt2rUjPz8fUD5/REREg+eRtj6CIAjuo9FVOgAzZ86kR48e3HPPPTXP\nDR48mE8//ZRJkyaxcuVKbrzxxgbPYzAYyMsrdkVIbiUqKkTidCESp2vxhTh9IUbwrTidodGCv23b\nNlavXk1cXByjR4/GYDCQmJhIQkICjz76KJ988gmxsbG8+eabjR1KEARBaASNFvwrr7yS3bt31/na\n4sWLG3t6QRAEwUXISltBEIRmggi+IAhCM0EEXxAEoZkggi8IgtBMEMEXBEFoJojgC4IgNBNE8AVB\nEJoJIviCIAjNBBF8QRCEZoIIviAIQjNBBF8QBKGZIIIvCILQTBDBFwRBaCaI4AuCIDQTRPAFQRCa\nCSL4giAIzQQRfEEQhLOk0GTi84R7+XbIDXyecA+FBSZPh+QULtnTVhAEoTnx7YzHuDflUwyAlv4z\nizEwfNFiT4fVIJLhC4IgnCWhhw9hsPxusDz2BVwi+DNnzuTaa69lxIgRNc/Nnz+fAQMGMGbMGMaM\nGcM333zjiqEEQRA8TqHRiGb5XQMKjV08GI3zuMTSGTt2LOPHjycpKcnu+YkTJzJx4kRXDCEIguA1\nXJ/8OosxEHr4EIXGLlyf/JqnQ3IKlwh+v379OHbsWK3nNU2r42hBEATfJjQ8wic8e0fc6uF//PHH\njBo1iqeeeori4mJ3DiUIgiA0gNsE/+677+arr74iJSWFyMhI5s6d666hBEEQXMLRjAwW9u3FWmN7\nFvbtxeGMDE+H5FLcVpYZERFR8/vtt9/O5MmTnXpfVFSIu0JyKRKna5E4XYsvxOmNMb53xQhmZh1T\n5Zalx5h3ww08cfSop8NyGS4TfEe/Pi8vj6ioKAC+/PJL4uLinDpPXp73Wz9RUSESpwuROF2LL8Tp\nTTEWmkx8O+MxQg8fIjory67cMtZk8po4z4SzF0+XCP706dPZsmULZrOZG264gWnTprFlyxZ2796N\nn58fHTt25Pnnn3fFUIIgCC7FdhHVXFSZpcHyM8vGqWgKuETwX3311VrPjRs3zhWnFgRBcCu2i6ju\nBp4JDKR7QACZ4RH839dfezAy1yMrbQVBaNbYLqK6AOgaP4L4w7lMSt+NsXt3T4bmcqSXjiAIzRpf\nXUR1LojgC4LQrPHVRVTnglg6giA0WXy1jbG7kAxfEIQmi6+2MXYXIviCIDQZbGvqC41Ggg9k+GQb\nY3chgi8IQpPBMaOfE9vRrq7eV9oYuwsRfEEQfB49s/dbn2qX0V8QEcHiq/7odAWOyWRmxoyNHD7c\nFqOxkPffHwX4uzv884YIviAIPk2hycS/B1/HJVnH2In9StnK7heelWc/Y8ZGUlLGAwbS0zWmTFnO\n/PnD3RK3JxDBFwTBJzmakUHquOFE5mTTpbqaAcD1wDygQ1AQ1UOGnnVN/eHDbcHmHuHgwWDXBu1h\nRPAFQfBJUscNt3a2BJYDdwG9gRNDhp5TNY7RWEh6uvUeoWvXEhdG7HlE8AVB8BkKTSY2PDqVgB+/\nI8ZstvPrg1HCvz22I3ec42rZ5OTBwFKLh1/EggUjqapyTezegAi+IAhejz4pq23aQBuzmWHAAuz9\n+t8CA8mPH8Edya8RGn5uXS7tu7w3vS1aRfAFQfBq9ElZR/vmbuBZwGgwkN0hlqEr19C5a7dGjdXU\nJ22ltYIgCF7NtzMe4xKL2IPVvrkAuAgwjBzDpPTdjRZ7aPqTtiL4giB4NaGHD1GC1WDR7ZvktqFs\nbH85r2eMJiHhUwoKzI0ey2gstBtJJm0FQRDciF5u2anARGZ4BC169eIBlI3TBsuk7MbNPJ60Sdkv\nuQa279CApSxaNOasx7NdbNWhw0mGDn2P7OxImbQVBEFwF/rEbNba1cysqKjZSPyF6mo+GzWW0MOH\nOGHsUjMp62i/HD7cttZK2eTkwYSHh51xXEffftSopaxffyMAERHes/euKxDBFwTBK9D74HwO9u0R\nCs3E11FT71gzbzQW1RJvZ7L+ui4cTRWXCP7MmTP5+uuvadeuHatXrwagsLCQxMREjh07RqdOnXjj\njTcICXFuZ3VBEJo+u7Zt48vRQ+ladpqDBgNRrVtjAIqxL7fMrKfE0rFmPjl5EHfcsY2zFe+6LhxN\nFZcI/tixYxk/fjxJSUk1zy1cuJBrrrmGhIQEFi5cyDvvvMPjjz/uiuEEQWgCfDkmntllp5XMahpP\nnzyJBsQDy4BC/MhoFcawxcvqfH94eFit7P1cxLuuC0dTxSWC369fP44dO2b3XFpaGh999BEAY8aM\nYfz48SL4giCwa9s20sbG0+10qZ110x14JSwMf8LZZO7HKt6G0+Hs/8dSFi3q69S5z0W867pwNMS5\nzBV4A27
\ngnCWiOB7GV+OiefFsrKaTH4ZkIry6SMHDub6RYs9GZ4gCD6MCL6X0a3stJ1XHwKYgoJYPGQo1ye/\n5sHIBEHwdUTwPYztxGyh0cjuwBZo5dYMvxioHjKU4ZLZC4LQSETwPYxduWX6z/x94CCe+vF7jGVl\nHDcYaHX9QMZIZi8IggsQwfcwoYcP2Vk4nQsLuftonidDEgShiSKbmJ9HCk0mPk+4l2+H3MDnCfdQ\nWGCi0Gi02e4cCo1dPBihIAhNGcnwzyOO9s1iDFyf/DqLMVg8/C4yMSsIgtsQwT+PONo3oYcPERoe\nIROygiCcF8TSOY+IfSMIgieRDN8NOJZaXp/8OqHhEWLfCILgUUTw3UBdXv3wRYvFvhEEwaOIpeMG\n6vLqBUEQPI0IvhsQr14QBG9ELB03IF69IAjeSKMEf926dcyfP5+MjAxWrFhBnz59ADh27Bjx8fF0\n69YNgMsuu4xnn3220cH6CuLVC4LgjTRK8OPi4pg/fz5PP/10rdcuuOACVq5c2ZjTC4IgCC6kUYKv\nZ/CapjVwpCAIguBp3DZpm5mZydixYxk/fjw//fSTu4YRBEEQnKTBDH/ixInk5+fXej4xMZHBgwfX\n+Z7o6Gi+/vprQkND2blzJw8//DBr1qyhTZs2DQYUFRXiRNjnD/OJE6Q+9BDBBw9S3LUr8QsWAN4X\nZ31InK5F4nQdvhAj+E6cztCg4H/wwQdnfdLAwEBCQ0MB6NOnD507d+bQoUM1k7pnIi+v+KzHcyef\nJ0yyLqLaupXFZZVM/OwTr4uzLqKiQiROFyJxug5fiBF8K05ncJmlY+vjm0wmqqurATh69ChHjhyh\nc+fOrhrqvCKLqARBaCo0atL2q6++Yvbs2RQUFDB58mR69uzJu+++y08//cTf//53AgIC8PPz4/nn\nn6dt27auivm8Umg0oqX/XLPloCyiEgTBV2mU4N90003cdNNNtZ4fMmQIQ4YMacypvQZZRCUIQlNB\nVto2gCyiEgShqSC9dARBEJoJzVLw69pbVhAEoanTLC2d+vrVC4IgNGWaZYYvpZaCIDRHmqXgS796\nQRCaI03e0qlrf1kptRQEoTnS5AW/Pr9ePHtBEJobTd7SEb9eEARB0eQFX/x6QRAERZO3dMSvFwRB\nUDR5wZfWCIIgCIomb+kIgiAIChF8QRCEZoIIviAIQjNBBF8QBKGZIIIvCILQTBDBFwRBaCY0SvCT\nk5MZOnQoo0aNYtq0aZSUlNS89s477zBkyBCGDh3Kd9991+hABUEQhMbRKMHv378/a9asISUlBaPR\nyDvvvAPA/v37SU1NZe3atSxatIjnnnsOTdMaOJsgCILgThol+Ndeey1+fuoUffv2JScnB4ANGzYQ\nHx9PQEAAnTp1wmg08ttvvzU+WkEQBOGccZmHv2LFCgYOHAhAbm4uHTp0qHmtffv25ObmumooQRAE\n4RxosLXCxIkTyc/Pr/V8YmIigwcPBmDBggUEBgYyfPhwgDrtG4PBUOs5QRAE4fzRoOB/8MEHZ3x9\n5cqVbNq0iSVLltQ8FxMTQ3Z2ds3jnJwcoqOjnQooKirEqeM8jcTpWiRO1+ILcfpCjOA7cTpDoyyd\nb775hnfffZcFCxbQokWLmucHDx7M2rVrKS8v5+jRoxw5coRLL7200cEKgiAI545Ba0T5zJAhQ6io\nqIMzjrUAAATvSURBVCAsLAyAyy67jGeffRZQZZkrVqwgICCAp556iv79+7skYEEQBOHcaJTgC4Ig\nCL6DrLQVBEFoJojgC4IgNBNE8AVBEJoJXiv47733Hj179sRsNns6lDp58803GTlyJKNHj+b+++8n\nLy/P0yHVyZn6HXkT69atY/jw4fTq1YudO3d6Ohw7vvnmG/70pz9xyy23sHDhQk+HUy8zZ87k2muv\nZcSIEZ4OpV5ycnKYMGEC8fHxjBgxwq6c25soLy/ntttuY/To0YwYMYL58+d7OqR6qa6uZsyYMUye\nPLnhgzUvJDs7W7vvvvu0QYMGaQUFBZ4Op05KSkpqfl+yZIn29NNPezCa+tm8ebNWVVWlaZqmvfzy\ny9orr7zi4YjqJiMjQzt48KA2fvx4bceOHZ4Op4aqqirtpptu0jIzM7Xy8nJt5MiR2v79+z0dVp1s\n3bpV27VrlzZ8+HBPh1Ivx48f13bt2qVpmvo/NGTIEK/9Pk+dOqVpmqZVVlZqt912m/brr796OKK6\n+eCDD7Tp06drDz74YIPHemWGP2fOHJKSkjwdxhlp06ZNze+lpaU1PYW8jfr6HXkb3bp1o0uXLl7X\nZO+3337DaDTSsWNHAgMDGTZsGGlpaZ4Oq0769etH27ZtPR3GGYmKiqJXr16A+j/UvXt3jh8/7uGo\n6iYoKAhQ2X5lZaWHo6mbnJwcNm3axG233ebU8Q2utD3fbNiwgQ4dOnDRRRd5OpQGef3110lJSSEk\nJMRrb01tWbFiBcOGDfN0GD5FXX2htm/f7sGImg6ZmZns2bPHaxdlVldXM3bsWI4cOcKf//xnr4xT\nT46Li4udOt4jgl9ff55HH32Ud955h/fff7/mOU9mfA31EUpMTCQxMZGFCxfy0UcfMW3aNA9EeXb9\njjzp7zoTp7fhbXccTYWTJ0/yyCOPMHPmTLu7ZW/Cz8+Pzz77jJKSEh566CH2799Pjx49PB1WDV9/\n/TWRkZH06tWLLVu2OPUejwh+ff159u3bx7Fjxxg1ahSappGbm8u4ceP473//S7t27c5zlA33EdIZ\nPnw4Dz74oMcE/1z6HXkCZ79PbyImJoasrKyax7m5uU73hRLqprKykkceeYRRo0Zx0003eTqcBgkO\nDuYPf/gD3377rVcJ/s8//8yGDRvYtGkTZWVlnDx5kqSkJJKTk+t9j1cZz3FxcWzevJm0tDQ2bNhA\n+/btWblypUfEviEOHz5c83taWhrdunXzYDT1U1+/I2/Gm7LqSy65hCNHjnDs2DHKy8tZs2YNN954\no6fDqhdv+u7qY+bMmfTo0YN77rnH06HUi8lkqrFJTp8+zQ8//OB1/8cfe+wxvv76a9LS0njttdf4\n4x//eEaxBy/08G0xGAxe+wf86quvcvDgQfz8/IiNjeW5557zdEh18sILL1BRUcF9990H2Pc78ia+\n+uorZs+eTUFBAZMnT6Znz568++67ng4Lf39/Zs2axX333Yemadx66610797d02HVyfTp09myZQtm\ns5kbbriBadOmMW7cOE+HZce2bdtYvXo1cXFxjB49GoPBQGJiIgMGDPB0aHbk5eXxxBNPUF1dTXV1\nNfHx8TX7ffgy0ktHEAShmeBVlo4gCILgPkTwBUEQmgki+IIgCM0EEXxBEIRmggi+IAhCM0EEXxAE\noZkggi8IgtBMEMEXBEFoJvw//5K32R/vBHAAAAAASUVORK5CYII=\n", + "text/plain": [ + "\u003cmatplotlib.figure.Figure at 
0x7f5be3c99f50\u003e" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Current loss: 9.48636\n" + ] + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "\n", + "plt.scatter(inputs, outputs, c='b')\n", + "plt.scatter(inputs, model(inputs), c='r')\n", + "plt.show()\n", + "\n", + "print('Current loss: '),\n", + "print(loss(model(inputs), outputs).numpy())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "sSDP-yeq_4jE" + }, + "source": [ + "### Define a training loop\n", + "\n", + "We now have our network and our training data. Let's train it, i.e., use the training data to update the model's variables (`W` and `b`) so that the loss goes down using [gradient descent](https://en.wikipedia.org/wiki/Gradient_descent). There are many variants of the gradient descent scheme that are captured in `tf.train.Optimizer` implementations. We'd highly recommend using those implementations, but in the spirit of building from first principles, in this particular example we will implement the basic math ourselves." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "MBIACgdnA55X" + }, + "outputs": [], + "source": [ + "def train(model, inputs, outputs, learning_rate):\n", + " with tf.GradientTape() as t:\n", + " current_loss = loss(model(inputs), outputs)\n", + " dW, db = t.gradient(current_loss, [model.W, model.b])\n", + " model.W.assign_sub(learning_rate * dW)\n", + " model.b.assign_sub(learning_rate * db)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "RwWPaJryD2aN" + }, + "source": [ + "Finally, let's repeatedly run through the training data and see how `W` and `b` evolve." 
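The epoch-by-epoch printout and the final plot that follow come from a driver cell whose source lies outside this hunk; only its outputs are visible here. Below is a sketch of the kind of loop that would produce output in that format. The epoch count of 10 and the learning rate of 0.1 are assumptions inferred from the printed numbers, not values read from the diff.

```python
# Hypothetical driver loop; epochs=10 and learning_rate=0.1 are assumed.
import matplotlib.pyplot as plt  # already imported earlier in the notebook

Ws, bs = [], []
epochs = range(10)
for epoch in epochs:
    Ws.append(model.W.numpy())
    bs.append(model.b.numpy())
    current_loss = loss(model(inputs), outputs)
    print('Epoch %d: W=%.2f b=%.2f, loss=%.5f' %
          (epoch, Ws[-1], bs[-1], current_loss.numpy()))
    train(model, inputs, outputs, learning_rate=0.1)

# Plot how W and b evolve toward their true values.
plt.plot(epochs, Ws, 'r', epochs, bs, 'b')
plt.plot(epochs, [TRUE_W] * len(epochs), 'r--',
         epochs, [TRUE_b] * len(epochs), 'b--')
plt.legend(['W', 'b', 'true W', 'true b'])
plt.show()
```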
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "height": 446 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 569, + "status": "ok", + "timestamp": 1527005915434, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "XdfkR223D9dW", + "outputId": "c43591ae-d5ac-4f2b-a8e7-bfce607e0919" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 0: W=5.00 b=0.00, loss=9.48636\n", + "Epoch 1: W=4.58 b=0.42, loss=6.28101\n", + "Epoch 2: W=4.24 b=0.76, loss=4.29357\n", + "Epoch 3: W=3.98 b=1.02, loss=3.06128\n", + "Epoch 4: W=3.78 b=1.23, loss=2.29721\n", + "Epoch 5: W=3.61 b=1.39, loss=1.82345\n", + "Epoch 6: W=3.49 b=1.52, loss=1.52970\n", + "Epoch 7: W=3.38 b=1.62, loss=1.34756\n", + "Epoch 8: W=3.30 b=1.70, loss=1.23463\n", + "Epoch 9: W=3.24 b=1.76, loss=1.16460\n" + ] + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAW0AAAEDCAYAAAD+/1UIAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAIABJREFUeJzt3Xl4VOXdPvD7zJZ9XwmELQkQIAELsiTsi6xiEBGXAiIW\nbV8WBY2K0tLa4lbsr283qxURtIoioAi8SpFNg6whi0FJKAoJBgLZt5k5c87vj5OZLIRkgEnOGXJ/\nritXJsmZyT0sN1+enPOMIMuyDCIicgs6tQMQEZHzWNpERG6EpU1E5EZY2kREboSlTUTkRljaRERu\nxODMQePGjYOvry90Oh0MBgM2b97c1rmIiKgZTpW2IAjYuHEjAgIC2joPERG1wKnlEVmWIUlSW2ch\nIqJWCM5cETl+/HgEBARAEATMmTMH9957b3tkIyKiJpxaHvnggw8QFhaG4uJiLFiwAD179sTgwYPb\nOhsRETXh1PJIWFgYACA4OBgTJ05EVlZWi8fL3t6AIADdugFvvglYrTeflIiIWl8eqampgSRJ8PHx\nQXV1NR5++GEsXrwYI0aMuPadCgtRvfoFeL2zDkJtLWxdu6NqRSrMs+8DDE4N9y4XFuaHoqIKVb73\ntTCTc7SYCdBmLmZyjlYzOaPVSfvy5ct44IEHkJKSgjlz5mDcuHEtFzYAREai6oWXUHwkA9WPPApd\n4QX4L/sVgpMGwWPTvwFRdCocERE15tQPIm9Ew3/FdBcK4P3ntfB89x0IVivEmFhUP/kMzCmzAL2+\nLb79VbT6LysztU6LmQBt5mIm52g1kzPa5YpIKaozKl9+DcWHT6Jm7gLof/wB/r98BEGjh8Fj28cA\nTyckInJKu17GLnWJRuXaP6P40AnUPDgP+jN58F+0AEFjhsO0fRvLm4ioFarsPSJ1647KP/0VxWnH\nUTvnAehPf4+AhfMQNG4ETDs/A/hiOkREzVJ1wyipR09U/OV1lHx9FLX3zIH+uxwEPPQAAieMgunz\nXSxvIqImNLHLny0mDhV/fxMlB4+g9u57YMjORMDcOQicNAamPV+wvImI6miitO1scb1Q8fo6lOz/\nBrUzZsJ4Mh0B99+DwKnjYdy7h+VNRNftL395DR999IHj4+XLl2DVqlWOj//61/+HDz/8txrRboim\nStvO1iceFf96B8V702CeNgPG48cQOGcmAu+cBOOBfSxvInJa//6JyM7OAKBsfldWVorc3FzH17Oz\nM5GQMECteNdNk6VtZ+vXH+Vvv4uSPQdhnjwVxiPfIPCeGQhImQpj2ldqxyMiN5CQMBBZWZkAgLNn\nz6Bnzxj4+PigsrISVqsVP/74A+Liequc0nnqXFN+ncSEASjf8AEMJ0/A+9UX4bH7c5hSpsIycjSq\nnloJcdhwtSMSkRN8Vj8Pj+3bXPqY5jtTULX699f8emhoKPR6Ay5duoisrEz075+I6uoyZGdnwsfH\nBzExsTCotL3GjdD0pN2UOPBnKH/vI5Ts2gPL2PEwHdyPoBmTEDD7LhiOHlY7HhFpVGJiIrKyMpCd\nrZT2gAEDkJWVgaws91oaAdxk0m5KHHQ7yjZtheHIYfi8sgam/Xth2r8X5vETUZ26EuJtg9SOSETN\nqFr9+xan4rbSr18isrIy8d//KssjHh4y/vnPf8HX1wfTpt3V7nluhltN2k2JQ4aibPMnKP1kFyzJ\nI+GxZzeCJo2F/8/vhSHzpNrxiEgjEhIGIC3tIPz9/SEIAgICAlBZWYHs7Cz075+gdrzr4talbWcd\nnoyyrTtQuuUzWIYlweOL/0PQhFHwn3c/9HU/gCCijismJhbl5WXo3z+x0ef8/Pzg7+9er33bLrv8\ntStZhvHAPvi8/AcYjx0BAJin3wWPXz+Hom69lRdn0Ait7jTGTM7RYi5mco5WMznjlpi0GxEEWEeP\nRemO3Sj9YAusPxsEj88+AYYMQeCEUfBc/xaEinK1UxIR3ZBbr7TtBAHWcRNQuutLlG7aCsycCUNO\nNvxSn0BIQm/4Ll8CQ/pxXqhDRG7l1i1tO0GAdex4YMsWFJ88hapnV0EKCYHXu+8gaNJYBI4fCc+3\n/wWhvEztpERErbr1S7sBKSIS1U88heKjmSj9YAvM02bAcOpb+D29HCGJveH7xGIYThzj9E1EmtWh\nSttBp4N13ASUv/2uMn2v/DWk0DB4vbcBQZPHIWjcCHiue5PTNxFpTscs7QakiEhUP/4kio9koHTT\nVpinzYD++1Pwe2aFMn0//j8wHD/K6ZuINKHDl7aDTgfr2PHK9J2eg8rnfgMpNBxe/96IoCnjETQ2\nmdM3kZsqLPwJ8+bNUTuGS7C0myFFRKJm2QoUHzmpTN/T74L+9HfK9J3QC77LfgXDsSOcvonciKCh\nazRuBku7Jfbpe91GXEk/hcrnV0MKj4DX++8iaOoEZfp+6w0IZaVqJyWiVoiiiD/8YTXmz78fy5Yt\ng9lsVjvSDbn1roi8BpddASVJMB7YB6+N62Ha9RkEUYT
s5QXzXXejZu5DEAcPcfqqS61elcVMztFi\nLq1nWr3aA9u3u3afujvvFLF6dcsFXFj4E2bPnoF//GMd+vdPwJ/+9CI6dYrGfff93KVZbkbHvSKy\nrel0sI4Zh/K3NuDKye9Q+fxvIYVHwPOD9xA0bSKCxiTB861/cvom0piIiEjH5lAzZsxAZmaGyolu\njFtuzaoVcng4apY+gZrFy2A8uB+eG9fDY+d2+D37FHx/92uYZ8xEzbwF1zV9E93KVq82tzoVt5Wm\na9ru+leSk7Yr6HSwjh6Lin+9o0zfq34HKSISnpv+XTd9D4fnv16HUFqidlKiDquw8Cd8+202AGDH\njh1ITByocqIbw9J2MTk8HDVLHkfxN+ko3fwpau+6G/q8XPitTEVIYm/4LXkMhiOHeeYJUTvr3r0H\ndu36DPPn34+ysjKkpNyjdqQbwh9EtgOhqAiem/4Nz41vw3D2vwAAsU88DAsfxpVREyH16KlKruZo\n/QdZWqLFXMzkHK1mcgYn7XYgh4WhZvEylBw6gdKPt6M25W7oz+QBTz2FkKEDETR6OLxf/gMMWRmc\nwImoRfxBZHvS6WAdORrWkaNReeUKQr/eA/Omj2A6sA8+a1+Gz9qXYYvuCvOUabBMvRPWIcMAN3qV\naCJqe2wElcghIcDChSifcS+EygoYv/wPPHZ+BtPuz+H9xj/g/cY/IAUHwzxpKixTpsMyeizg5aV2\nbCJSGUtbA2RfP1hmzIRlxkzAYoHx64NKgf/fDni9/y683n8Xsrc3LGMnwDx1OiwTJ0EODFI7NhGp\ngKWtNSYTrGPHKy/c8PJaGE4cg8euHTDt3A6PHZ/CY8enkA0GWJNGKgU+ZRqkTlFqpyaidsLS1jKd\nDuLgIRAHD0HV86uhP/09PHZ9BtPO7TAd2AvTgb3AMytg/dkgmKdMh2XqnbDF9VI7NRG1IZ494i4E\nAbbefVD9+JMo/WI/rqTnoOLFV2EZOQaGjJPw/cNvEZw8GEFJg+Dz+9XKHuCSpHZqItVVVlZi69bN\nbfb406dPQGVlJQDgypXLGDnydmRlZTT4+kSUl7vuxcSdLm1JkjBz5kw89thjLvvmdOOkzl1Qu/BR\nlH38Ka7knEH5X/8J89Q7oS/Ih/f/voagKeMRPDAevqlPwLjvS8BiUTsykSoqKsqxdetHzX5NcsFg\n07dvArKzMwEA2dmZ6NWrD7KylI/PnfsRgYFB8Pf3v+nvY+d0aW/YsAExMTEu+8bkOnJQMMz33o/y\n9e/h8qmzKHvnfdTOeQCCuRZe699C4L0pCOkbA79fPgLT9m1A3VRA1BG8/vpfceFCAR5++EH8/e//\ni/T045g3bx5++9vnMX/+fVe9QML777+Lt99+EwBQUJCPFSuW4pFH5mHx4kU4d+7Hqx4/ISHRUdpZ\nWZmYM+dBfPttfYknJCS69Pk4taZdWFiI/fv347HHHsPbb7/t0gDkYt7esEyZBsuUaYAowvhNGky7\nPoPHzs/g+fGH8Pz4Q8geHrCMGQfLlOkw3zEFcmio2qmpAwke1L/Zzxcfz3bJ8U398pdL8MMP/8W6\nde8BANLTjyMrKwsbNnyIyMhIFBb+dM0XSHjllTVITV2Jzp27ICcnG2vXvoQ///kfjY7p3z8R69e/\nBQA4depbPPLIY/joo38DUEo8IWGAUzmd5VRpr1mzBqmpqaio0NZln9QKgwHWEaNgHTEKVb9/GYas\nDOUslF074PH5Lnh8vgu+Oh2sQ4fDMnU6zFOmA2HN/wUhupUkJiYiMjKyxWNqamqQnZ2BVauehn23\nD1EUrzqub99+yM39HrW1tbDZbPD09ERUVGcUFOQjOzsD99/v2j27Wy3tffv2ITQ0FPHx8Th8+LDT\nD+zsdfTtqcNnGj9SeVv7CpCbC3zyCYStW2E6lAbToa/hu+pZICEBYWPGAGPGAKNGARqZwrX4ewdo\nM5fmMzWzxAAAYde68/Ue34TFUg69XufIEBjoDS8vL8fHklQNQajPaDQCOp0JwcHeCAgIwPbtn7by\nHfzQvXs37N//OQYMSEBYmB+GDBmMrKxjKC8vw6Br/E/hRrVa2idOnMCXX36J/fv3w2w2o6qqCqmp\nqXjllVdavJ8WN2NhpgYCI4H5jwLzH4Vw8SI8Pt8Jj53bYUr7CsjKAv7yFwCAGN8X1uHJsCSPhHVY\nMuQwZ/+quI4Wf+8AbeZipqvV1sqoqKh0ZCgtrQZQ31GSZMLly1dw5kwBPD09sXv3HgwbloSaGhkR\nEZ3w4YdbMXbsBABAXl4uYmPjrvoeffr0w7p1b2PhwkdRVFSBbt164YUXViE+vp/Tz93Zf2xbLe3l\ny5dj+fLlAIAjR45g3bp1rRY2uRc5IgK18xagdt4ChPmbUPrFPhi/Pghj2tcwHjsMw6kceK1TfjAj\n9u4D6/BkWJNHwjJ8BOTwcJXTE7XM3z8ACQkDMH/+fRg6NAnDhyc3+rrBYMCCBY9g0aL5iIrqjG7d\nuju+9utfv4A//vElvPPOOthsIsaPv6PZ0k5IGIDNmzehXz/llXF69+6DoqIizJgx0+XP57q2ZrWX\n9uuvv97qsfzXvnVukcligSH9BEyHvlKK/OhhCNXVji+Lcb1gHT4C1uQRsCaNgBTR8jqhSzJphBZz\nMZNztJrJGdxPW0VumclqheHkCRgPfQ3T1wdhOHIYuqr6UwjFmFhYk0Y43lxxib0Wf50AbeZiJudo\nNZMzeBk7XR+jEeLtQyHePhQ1S5crJZ55UllKSTsI4+Fv4LVxPbw2rgcA2Lr3UNbD65ZUpM5d1M1P\n5OZY2nRzjEaIg26HOOh21Cx5HBBFGLIylBI/9BWMh9Lg9d4GeL23AQBg69odluQR9SUe3VXlJ0Dk\nXlja5FoGA8TbBkG8bRBq/mcpYLPB8G0WjF9/VV/iddvNAoAtuiusSSNgsS+ndO3mvi+TTdQOWNrU\ntvR6iIkDISYORM0vFwM2G/Q538KUdtAxjXtu+jc8NylXkNk6d3Gsh1uSRkDq3kPlJ0CkLSxtal96\nPWwJiahJSETNo/8DSBL0p3Ial/hHH8Dzow8AALZOUcCY0fDqkwAxcQDEhETI/gEqPwki9bC0SV06\nHWz9+qOmX3/U/OKXSol//x2MaV/BlKYsqeD99+GL9x13EXv0VKb3hAF1RT5Aefk2og6ApU3aotPB\nFt8Xtvi+qF24CJBlhJUWonx/GgyZGcpb1kl4frIF+GSL4262LtH1JZ44AGLiwDY5Z5zcT2VlJXbv\n/j/MnHlPm32PNWt+i+TkkRg9elybfQ87ljZpmyAAvXrBHNQJ5pRZyudkGbr8844CN2RmwJhxEh67\nPoPHrs8cd7WFR9SXeMJAiIkDIHWJ5g86Oxj7ftpNS1uSJOh07vc6MCxtcj+CACm6KyzRXWGZdqfj\n07qLhTBknmwwkWfA4z9fwOM/XziOkYKCHAVuf7N17wm44V9edzVokE+znz9+vMolxzfVcD9tvV4P\nLy9vREVF4t
tvc/Dqq39Gaurj2LBhEwBlL+3a2hosWPALFBTk47XXXkFZWSk8PT2Rmvocunbtds3v\nc/ToYXz44fsoKSnG4sVPIClphFP5rhdLm24ZUkQkLBMnwzJxsuNzwpUrMGTVl7gh82T962va7+fr\nBzEh0bE+LiYOhC02DjDwr8etoOF+2unpx5Ga+gTWrn0VRqPfTe+l3VBh4U/429/eRH7+eSxd+hg2\nbdoGo9Ho8ufDP5V0S5NDQmAdMw7WMfVrjUJ5GQzZWfVTeVYGjIcPwXTo6/r7eXlB7NvfsT4uJg6A\n2DseMJnUeBq3FGcn5Bs9vjV9+/ZDVFRUi5exO7uXdkPjxk0EAHTpEo2oqM748ccfmt1c6maxtKnD\nkf0DHOeCO1RVwZCT3WAiz4AhIx3G40fr72c0QozvpxR4/0Rg2CAIIZ2VnQ65Tu42PD09Hbf1ej1s\ntvrXibRYzAAAWZbg5+fveLUbZzSd2K81wd8sljYRAPj4OPZUcTCbYfgup9FZK4Zvs2HMPOk4JBSA\n5B8AW2wsbDFxsMX1glj33tajJ+Dh0f7PhRrx9vZGdd3OlE33xwsKCkZpaQnKy8vh6emJtLSvMGxY\nEry9fdCpUxT27v1Pq3tp2+3d+x9MnjwNFy4U4MKFghbXv28GS5voWjw8IA64DeKA2+o/Z7VCn3sa\nhqwM+F/4EeaMbOjP5MKQlQnjieON7i7rdJC6doMYG+codFtsHMTYXsqLSXA6bxcN99M2mTwQHBzs\n+Jor9tK2i47uhsWLF6GkpBhPPbWyTdazAW7Nqipmco4WMwFNcokidOd+hOFMLvS5udCfyVXKPS8X\nustFV93XMZ3H1he5LTbupqdzLf5aMZNzuDUrUXsyGCD1jIGlZwzQ4OwVABBKS6DPy4U+LxeGuvf6\nvNOtT+f2Iq9bcuF0TgBLm6jNyYFBEAcPgTh4CMwNvyCK0J/7oa7E86DPO11X7KeVc8sbnF8O1E3n\nccpSixjXS1lyccF0Ts7bsGEd9u79DwRBgCzLEAQBY8dOwNy5C9otA5dHVMRMztFiJqBtc101neee\nVpZczv4XgtXa6FjHdB7XCx59eqEyJBK26GhIXaJh69IVcmioqhO6Fn//tJrJGZy0iTTIqem8bu3c\nUFfoHrs/B3Z/Dt+mj+XlBVvnLkqJR3eFFN0VtrpCl6KjIUV2AvT6dnx2dDNY2kTuxGCArWcsbD1j\ngTumNPqSUFKM0MorKMv8Dvr8c9Dln4f+/Pm69z/CkJfb7EPKBgOkqM6wdbFP59GOYpeio2HrHM3l\nFw1haRPdIuSgYKBXN1iir3FaWmUl9PnnlUI/fx76/PPQ5Z9zFLvx0NcQrrFaaguPUAo8uiukLg0K\nvW5al32d+6893TyWNlFH4esLW5942PrEN/91sxm6CwV1ZX4e+vPn6m+fOwdDxkkYjx9r9q5SYKBS\n4F2i69bT64sdiX0A2YNLMC7C0iYihYcHpB49IfXo2fzXbTboLhbWTen1yy/224b/5kHIzmz2rqF6\nPaTQMEjhEZAiIhq/D49s9DG8vdvwSbo/ljYROUevhxTVGVJUZ4hDh139dVmGUFzcYPlFKXPv4iKI\n5wuUrXPP5ELIymjx20h+/pDCwyFFRCrvHcVu/1wEpIhIyMHBHXJLXZY2EbmGIEAOCYEYEgI0uPTf\nO8wPpQ1OrxMqK6C7dBG6ixfr3hdCd+lS3fv6z+v/e+aaa+xA3Q9Qw8KbTO0RjlJvWPJosEmUu2Np\nE1G7kn39YPP1U86AaYnVCt2Vy1eVeePCvwjD96cgZKS3+FBSQGD91B4RAXTtAm9PX0jBIZCCgyEH\nBUMKCoYcEgIpKFjTJc/SJiJtMhohRXZSziNviSxDqChvMq03md7r3gy5px13a/71cOoe0ttbKfSg\nukIPaVDswcH1X6u7LQcHQ/bxbZeLmFjaROTeBAGyfwBs/gHKKw61xGKB7nIRQsQqlJ45D11JMYSS\nYuiuXGl0Wygpga6kGIYzeRCqnXsRBtlobDSty0H1hS4FBSsTfXDj4pcDAq97XZ6lTUQdh8kEKaoz\nEOYHa9dezt3HbFYKvbgYuuIrSrHbbxcX15e9/eOfLsBwKseph5Z1OsiBgZCCQ4AG/wtoCUubiKgl\nHh7KEk1kJ9icvY8oQigtVQq9boq/ZvHX3XYWS5uIyNUMBsihobCFhgJOvkxkmJMP3fFOciQicmMs\nbSIiN8LSJiJyIyxtIiI30uoPIi0WCx588EFYrVbYbDZMmjQJixcvbo9sRETURKulbTKZsGHDBnh5\necFms+H+++/HqFGjkJiY2B75iIioAaeWR7y8vAAoU7coim0aiIiIrs2p0pYkCSkpKUhOTkZycjKn\nbCIilTh1cY1Op8O2bdtQWVmJX/3qV8jLy0NsbAs7dHXvjmDp6i0Vi49nN3t48KD+zX7epcfrhKsy\nqZoHuCqT6nmaZNJEngaZNJPH7tyPmsrD42+N41tzXVdE+vr6YsiQITh48GDLpQ1Ar7t6t6trvkR8\nM8e2xfFNM6mdp2kmLeRpmEkreeyZtJSnxfuolMd+/FX3UznPVffVQJ5GH2skj7MEWW5hl3EAxcXF\nMBqN8PPzQ21tLRYuXIhFixZh9OjRLT5wUYNNz7UgLMyPmZzATM7TYi5mco5WMzmj1Um7qKgIzzzz\nDCRJgiRJmDp1aquFTUREbaPV0u7duze2bt3aHlmIiKgVvCKSiMiNsLSJiNwIS5uIyI2wtImI3AhL\nm4jIjbC0iYjcCEubiMiNsLSJiNwIS5uIyI2wtImI3AhLm4jIjbC0iYjcCEubiMiNsLSJiNwIS5uI\nyI2wtImI3AhLm4jIjbC0iYjcCEubiMiNsLSJiNwIS5uIyI2wtImI3AhLm4jIjbC0iYjcCEubiMiN\nsLSJiNwIS5uIyI2wtImI3AhLm4jIjbC0iYjcCEubiMiNsLSJiNwIS5uIyI2wtImI3AhLm4jIjbC0\niYjciKG1AwoLC5GamorLly9Dr9dj9uzZmDdvXntkIyKiJlotbb1ej2effRbx8fGoqqrC3XffjeTk\nZMTExLRHPiIiaqDV5ZGwsDDEx8cDAHx8fBATE4NLly61eTAiIrrada1p5+fn47vvvkNiYmJb5SEi\noha0ujxiV1VVhaVLl2LlypXw8fFp8dju3QFJuvqY48ermj1+0KDmH8+Vx+t0V2dSMw+AqzKpnadp\nJi3kaZhJK3nszp1r9tOq5eHxt8bxrXGqtEVRxNKlS3HXXXdhwoQJTj2wTnf1EB8W5neNY5t/DFcf\n3zST2nmaZtJCnoaZtJLHnklLeVq6j1p57Mc3vZ/aeZre1kKehh9rJY+zBFmW5dYOSk1NRVBQEJ59\n9lmnH7ioqOKGArWVsDA/ZnICMzlPi7mYyTlazeSMVte0jx8/ju3bt+Obb75BSkoKZs6ciQMHDtx0\nQCIiun6tLo8MGjQIp06dao8sRETUCl4RSUTkRljaRERuhKVNRORGWNp
ERG6EpU1E5EacviKSiIiu\nnyQBZWVASYmA4uL6t5ISwfG5khIBn37q3OOxtImInGSxoFHRNizgpkWsfAyUlgqQJMFlGVjaRNTh\nyDJQWYmrCvdaU7D981VVzpWvXi8jKEhGaKiMuDgJQUEygoOVt6Ag1L2XHe+DgmQAvk49NkubiG4Z\nNTVAUZGAixcFXLqkw6VLyu2iIuVj5fMCLl8GLBbnLhv39lZKtUcPqVHR1pdw4/INCZHh5wcIrhuu\nG2FpE5GmSZIyEV+6JDhK2F7IDd8uXtShvLzlpvTwkBERIWPgQMDfX2yxfO23vbza6Yk6iaVNRKqo\nqUGjwm1cwvVTcVGRAFFsuYxDQiR07izhtttkhIfLiIiQEB5uvy3X3Zbg769MwMqGUTXt9Exdi6VN\nRC4likBhoYD8fB3OnxdQUQGcPevRYEpWStn5qVhCeLjUqIAblnJYmAyjsZ2enAawtInoutTUABcu\nCDh/Xof8fB3y8+23laK+cEGAzda0kE2OW9eaiusnYuVzbbku7M5Y2kTUSFkZGpVw49sCLl9u/po8\nQZARGSnjZz+T0KWL/U1GfLwnPD2rEBGhnE3RkabitsDSJupAZFlZR25Ywsq0XH+7oqL58dZolNG5\ns4z4eBFdusjo0kVCdLTkuB0VJcNkuvp+YWGeKCqS2viZdRwsbaJbiNUKnDvXtJDrlzIKCgSYzc2X\nso+P3KiEu3SxfywhOlpZtmjppdeofbC0idyMLAM//SQgL0+H3FwdzpzRIS9PeV9QAEhS8xdphIZK\niI9X1pPrC7m+mAMDuYbsDljaRBpVXQ2cOaOUccNyzsvTobr66naNiJCQlARERFgbTczR0TI6d5bg\n7a3CkyCXY2kTqcg+Nefm1heyfWrOz796LcLTU0bPnhJiYxu/xcQoZ1so5x/XqvBMqL2wtInagX1q\nbljK9um5uak5MlLCyJEiYmIal3OXLlxX7uhY2kQuIsvK+csNJ2b7W0HBtafmuDjJUc72277O7R1E\nHRBLm+g6WSzA99/rcOkScOKEqdH03NzU3KmTMjU3XMqIi5PQuTOnZrp+LG2iFtTUADk5OmRm6pGV\npbw/dUoHq9Vezh4AAC+va681c2omV2JpE9WprASys/XIzKwv6dOndY0uyfbwkJGQIKF/fxsGDzYh\nIqIasbGcmqn9sLSpQyopAbKylIJW3utx5kzj1vX2ljF4sA2JiRISEpT3cXGS4zLssDATiopsKqSn\njoylTbe8S5cEx9KGvaTPnWtc0AEBMkaOFJGQICEx0YbERBt69uT0TNrD0qZbhv3sjYblnJmpQ2Fh\n4+YNDZUwbpyIxESbo6S7dpV5NSC5BZY2uSVZBn74QXAUs30N+sqVxgUdFSVh8mRrgwlaQmQkC5rc\nF0ubNM9mA06f1jUq56ws/VWb6HfrJiEpyepYg05IkBAWJquUmqhtsLRJc8xmID1dj6+/1iMtTY/j\nx4Hqah/H1wVBeYXrCRPqp+f+/W0IDFQxNFE7YWmT6mprgRMnlIJOS9Pj2DE9amvrp+h+/YCEBKtj\nDbpfPxvPfaYOi6VN7a6mBjh+vL6kjx/XO/Z4FgQZfftKSE62YfhwG4YPF9G7NzdBIrJjaVObq64G\njh2rL+lQSYIFAAANpklEQVQTJ/SwWOpLun9/CUlJNiQl2TBsmIigIJUDE2kYS5tcrqrq6pK2X/at\n09WXdHKyiKFDuRZNdD1Y2nTTKiuBo0ftJW1AeroOolhf0omJ9klaKemAAJUDE7kxljZdt8pK4MgR\n+9kdBmRk1Je0Xi9jwAAJw4crk/SQITb4+6scmOgW0mppr1y5Evv27UNISAi2b9/eHplIYyoqgMOH\n6yfpjIz6TZT0ehkDB0pIShKRnGzDkCE8s4OoLbVa2nfffTfmzp2L1NTU9shDGlBeDnzzjVLQaWnK\nFYeSpJS0wSDjttskJCeLGD6cJU3U3lot7cGDB6OgoKA9spBKZBk4eVKHXbsMOHgQSE/3dZS00ajs\ndGc/u+P2223w8WnlAYmozXBNu4OyWoFDh/TYtcuAXbsMuHBB2bPDaARuv92G5GSlpAcPtvFVvIk0\npM1KOyzMr60e+oZ19EzV1cAXXwBbtwLbtyt7SgNAYCAwdy6QkgJMmgT4+BigtX/Ptfh7B2gzFzM5\nR4uZnNFmfzOLiira6qFvSFiYX4fMVFICfPGFATt3GrBvnwE1NcqyR2SkhAULREydKiIpyebY2N/H\np2P+Ot0ILeZiJudoNZMznCptWeZOae7kwgUBu3YpRZ2Wpnec6REba8PUqUpRDxwocYN/IjfUammv\nWLEChw8fRmlpKcaMGYMlS5Zg1qxZ7ZGNrkNurg47dypFnZ6ud3z+ttuUop4yRUSvXpKKCYnIFVot\n7bVr17ZHDrpOkqSc8WEv6rw8paj1euVls+xFHRXF/yUR3Uq09dMmapHVCqSl1Z/x8dNPyvqGl5eM\nKVOsmDpVxB13cMMlolsZS1vjqquBvXuVaXr3bgNKS5X16cBAGffeqxT1mDEiT8sj6iBY2hpUUgJ8\n/rlS1Pv315/xERUlYdYspaiHDas/44OIOg6WtkYUFAiOZY+GZ3z06lX/g8SBAyW+IC1RB8fSVtGp\nU8C775qwc6cBJ0/Wn/Hxs5/ZT82zIjaWP0gkonos7XZWWgp89JER775rxKlTAOABg0HGqFH1Z3x0\n6sSiJqLmsbTbgSwDR4/qsGGDCZ9+akBtrQCjUcbMmcCECTWYOFHkq7cQkVNY2m2orEyZqjduNOLU\nKWX5o0cPCXPnmjFnjoi+fX1RVCSqnJKI3AlL28VkGTh2TIeNG0345BPlzA+jUcZdd1kxd64VI0bY\nePk4Ed0wlraLlJUBmzcbsWFD/VTdrZuEuXMtuP9+K8LCuE5NRDePpX0TZBk4cUJZq962TZmqDQYZ\nM2YoU/XIkZyqici1WNo3oLy8fqrOyWk8Vd93nxXh4ZyqiahtsLSdJMtAeroOGzYYsW2bEdXVylQ9\nfboV8+ZZMWoUp2oianss7VZUVChT9caNRmRnK1N11671U3VEBKdqImo/LO1m2F/oduNGI7ZsUaZq\nvV7GtGnKVD16NKdqIlIHS7uBysr6qTorq36q/vnPlTNAOFUTkdpY2gAyMpS16o8/rp+qp05Vpuox\nYzhVE5F2dNjSrqwEtmxRzgDJzFSm6i5dJCxdasEDD1gRGcmpmoi0p8OVdmamDu+8o6xVV1UpU/Xk\nyVbMn69M1Xp9649BRKSWDlHalZXAtm3A3//u7dgCtXNnCYsXK1M1d9UjIndxS5d2eTnwxhsmvP66\nCeXlgE6nw+TJylr12LGcqonI/dySpV1ZCbz1lgl/+5sJpaUCQkIkrF4tICWliq9OTkRu7ZYq7epq\nYN06I/72NxOuXNEhMFDGc8+ZsXChBT16+KGoiIVNRO7tlijt2lpgwwYj/vxnE4qKdPD3l5Gaasai\nRRb4+6udjojIddy6tM1m4N
13lbIuLNTBx0fG8uVmPPaYha8EQ0S3JLcsbasVeP99I/70JxMKCnTw\n9paxZIkZv/qVFSEhXAIholuXW5W2KAIffWTA2rUeOHdOB09PGY89ZsGSJRa+yAARdQhuUdo2G7Bl\niwF//KMHzp7VwWSS8cgjFixbZuF+IETUoWi6tCUJ+PRTA1591YTcXD2MRhkPPWTB449beOoeEXVI\nmixtSQJ27lTK+tQpPfR6GT//uVLWXbuyrImo49JUacsy8MUXerz8sgeys/XQ6WTMmWPF8uVm9OjB\nsiYi0kRpyzKwd69S1unpegiCjLvvtuLJJ82IjWVZExHZqVrasgwcPKiU9dGjykYgM2ZY8eSTFvTp\nI6kZjYhIk1Qr7UOH9HjpJRMOHVIiTJlixVNPWdC/P8uaiOha2r20jx7V4aWXPHDwoPKtJ04UkZpq\nxoABLGsiotY49UJaBw4cwOTJkzFp0iS88cYbN/SNTpzQ4b77vDBtmg8OHjRgzBgRu3ZV4b33aljY\nREROanXSliQJL7zwAtavX4/w8HDcc889GD9+PGJiYpz6BllZOrzyigc+/1z5ViNGiEhNtWDYMNvN\nJSci6oBaLe3MzEx069YNnTt3BgBMmzYNe/bsabW0c3J0ePVVE3bsMAIAhg4V8fTTFowYwbImIrpR\nrZb2xYsX0alTJ8fHERERyMrKavE+990HfPihN2RZwKBBNjz9tBmjR9sgCDcfmIioI2u1tGX5+s+T\n3rQJGDBAwtNPmzF+PMuaiMhVWi3tyMhIXLhwwfHxxYsXER4e3uJ9lJ7XA/C+yXiuFRbmp3aEqzCT\nc7SYCdBmLmZyjhYzOaPVs0cSEhJw7tw5FBQUwGKxYMeOHRg/fnx7ZCMioiZanbT1ej1WrVqFhx9+\nGLIs45577nH6zBEiInItQb6RRWsiIlKFUxfXEBGRNrC0iYjcCEubiMiNuHTDqAMHDmDNmjWQZRmz\nZs3CokWLXPnwN2TlypXYt28fQkJCsH37drXjAAAKCwuRmpqKy5cvQ6/XY/bs2Zg3b56qmSwWCx58\n8EFYrVbYbDZMmjQJixcvVjWTnSRJmDVrFiIiIvD666+rHQfjxo2Dr68vdDodDAYDNm/erHYkVFRU\n4LnnnkNubi50Oh3WrFmDAQMGqJrp7NmzeOKJJyAIAmRZxvnz57Fs2TLV/6yvX78emzdvhiAI6NWr\nF1588UWYTCZVM73zzjuOP0et9oHsIjabTZ4wYYKcn58vWywWecaMGXJeXp6rHv6GHT16VM7JyZGn\nT5+udhSHS5cuyTk5ObIsy3JlZaV8xx13aOLXqrq6WpZlWRZFUZ49e7ackZGhciLF22+/La9YsUJ+\n9NFH1Y4iy7Isjxs3Ti4tLVU7RiNPP/20vHnzZlmWZdlqtcoVFRUqJ2rMZrPJycnJ8oULF1TNUVhY\nKI8bN042m82yLMvysmXL5K1bt6qa6fTp0/L06dNls9ksi6IoP/TQQ/KPP/54zeNdtjzScI8So9Ho\n2KNEbYMHD4a/v7/aMRoJCwtDfHw8AMDHxwcxMTG4dOmSyqkALy8vAMrULYqiymkUhYWF2L9/P2bP\nnq12FAdZliFJ2tmZsrKyEseOHcOsWbMAAAaDAb6+viqnaiwtLQ1du3ZttCWGWiRJQk1NDURRRG1t\nbasXC7a1M2fOYODAgTCZTNDr9bj99tuxe/fuax7vstJubo8SLRSR1uXn5+O7775DYmKi2lEgSRJS\nUlKQnJyM5ORkTWRas2YNUlNTIWhoLwRBELBw4ULMmjULH374odpxkJ+fj6CgIDz77LOYOXMmVq1a\nhdraWrVjNbJz505MmzZN7RiIiIjAggULMGbMGIwaNQp+fn5ISkpSNVNcXByOHj2KsrIy1NTU4MCB\nA/jpp5+uebzLSlvm6d7XraqqCkuXLsXKlSvh4+OjdhzodDps27YNBw4cQEZGBvLy8lTNs2/fPoSG\nhiI+Pl5Tf74++OADbNmyBW+++Sbee+89HDt2TNU8oigiJycHDzzwALZu3QpPT88b3ve+LVitVnz5\n5ZeYMmWK2lFQXl6OPXv2YO/evTh48CCqq6tV/1lXTEwMfvGLX2DBggVYtGgR+vTpA4Ph2j9udFlp\n38geJR2ZKIpYunQp7rrrLkyYMEHtOI34+vpiyJAhOHjwoKo5Tpw4gS+//BLjx4/HihUrcPjwYaSm\npqqaCVCWtwAgODgYEydObHXXy7YWGRmJyMhIJCQkAAAmTZqEnJwcVTM1dODAAfTr1w/BwcFqR0Fa\nWhqio6MRGBgIvV6PiRMnIj09Xe1YmDVrFrZs2YKNGzciICAA3bp1u+axLittLe9RoqUpzW7lypWI\njY3F/Pnz1Y4CACguLkZFRQUAoLa2FocOHULPnj1VzbR8+XLs27cPe/bswWuvvYahQ4filVdeUTVT\nTU0NqqqqAADV1dX46quvEBcXp2qm0NBQdOrUCWfPngUAfPPNN5raamLHjh2YPn262jEAAFFRUcjI\nyIDZbIYsy5r5tSouLgYAXLhwAbt3727x18tlp/xpdY8S+4RWWlqKMWPGYMmSJY4f2Kjl+PHj2L59\nO3r16oWUlBQIgoAnnngCo0aNUi1TUVERnnnmGUiSBEmSMHXqVIwePVq1PFp1+fJlLF68GIIgwGaz\n4c4778SIESPUjoXnn38eTz75JERRRHR0NF588UW1IwFQBoC0tDT87ne/UzsKACAxMRGTJk1CSkoK\nDAYD+vbti3vvvVftWFiyZAnKyspgMBjwm9/8Bn5+196BkHuPEBG5EV4RSUTkRljaRERuhKVNRORG\nWNpERG6EpU1E5EZY2kREboSlTUTkRljaRERu5P8D+7Wym3BFpegAAAAASUVORK5CYII=\n", + "text/plain": [ + "\u003cmatplotlib.figure.Figure at 0x7f5be4b8ec50\u003e" + ] + }, + "metadata": { + "tags": [] + }, + "output_type": "display_data" + } + ], + "source": [ + "model = Model()\n", + "\n", + "# Collect the history of W-values and b-values to plot later\n", + "Ws, bs = [], []\n", + "epochs = range(10)\n", + "for epoch in epochs:\n", + " Ws.append(model.W.numpy())\n", + " bs.append(model.b.numpy())\n", + " current_loss = loss(model(inputs), outputs)\n", + "\n", + " train(model, inputs, outputs, learning_rate=0.1)\n", + " print('Epoch %2d: W=%1.2f b=%1.2f, loss=%2.5f' %\n", + " (epoch, Ws[-1], 
bs[-1], current_loss))\n", + "\n", + "# Let's plot it all\n", + "plt.plot(epochs, Ws, 'r',\n", + " epochs, bs, 'b')\n", + "plt.plot([TRUE_W] * len(epochs), 'r--',\n", + " [TRUE_b] * len(epochs), 'b--')\n", + "plt.legend(['W', 'b', 'true W', 'true_b'])\n", + "plt.show()\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "vPnIVuaSJwWz" + }, + "source": [ + "## Next Steps\n", + "\n", + "In this tutorial we covered `Variable`s and built and trained a simple linear model using the TensorFlow primitives discussed so far.\n", + "\n", + "In theory, this is pretty much all you need to use TensorFlow for your machine learning research.\n", + "In practice, particularly for neural networks, the higher level APIs like `tf.keras` will be much more convenient since it provides higher level building blocks (called \"layers\"), utilities to save and restore state, a suite of loss functions, a suite of optimization strategies etc. \n", + "\n", + "The [next tutorial](TODO) will cover these higher level APIs." + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "default_view": {}, + "name": "Training Models", + "provenance": [], + "version": "0.3.2", + "views": {} + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb new file mode 100644 index 00000000000000..4fe3a0e3f3d431 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb @@ -0,0 +1,551 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "pwX7Fii1rwsJ" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "tf.enable_eager_execution()\n", + "tfe = tf.contrib.eager\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "UEu3q4jmpKVT" + }, + "source": [ + "# High level API\n", + "\n", + "We recommend using `tf.keras` as a high-level API for building neural networks. That said, most TensorFlow APIs are usable with eager execution.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "zSFfVVjkrrsI" + }, + "source": [ + "## Layers: common sets of useful operations\n", + "\n", + "Most of the time when writing code for machine learning models you want to operate at a higher level of abstraction than individual operations and manipulation of individual variables.\n", + "\n", + "Many machine learning models are expressible as the composition and stacking of relatively simple layers, and TensorFlow provides both a set of many common layers as a well as easy ways for you to write your own application-specific layers either from scratch or as the composition of existing layers.\n", + "\n", + "TensorFlow includes the full [Keras](https://keras.io) API in the tf.keras package, and the Keras layers are very useful when building your own models.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + } + }, + "colab_type": "code", + "id": "8PyXlPl-4TzQ" + }, + "outputs": [], + "source": [ + "# In the tf.keras.layers package, layers are objects. To construct a layer,\n", + "# simply construct the object. 
Most layers take as a first argument the number\n", + "# of output dimensions / channels.\n", + "layer = tf.keras.layers.Dense(100)\n", + "# The number of input dimensionss is often unnecessary, as it can be inferred\n", + "# the first time the layer is used, but it can be provided if you want to \n", + "# specify it manually, which is useful in some complex models.\n", + "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Fn69xxPO5Psr" + }, + "source": [ + "The full list of pre-existing layers can be seen in [the documentation](https://www.tensorflow.org/api_docs/python/tf/keras/layers). It includes Dense (a fully-connected layer),\n", + "Conv2D, LSTM, BatchNormalization, Dropout, and many others." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "height": 204 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 244, + "status": "ok", + "timestamp": 1527783641557, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "E3XKNknP5Mhb", + "outputId": "c5d52434-d980-4488-efa7-5660819d0207" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\u003ctf.Tensor: id=30, shape=(10, 10), dtype=float32, numpy=\n", + "array([[ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)\u003e" + ] + }, + "execution_count": 3, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# To use a layer, simply call it.\n", + "layer(tf.zeros([10, 5]))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "height": 221 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 320, + "status": "ok", + "timestamp": 1527783642457, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "Wt_Nsv-L5t2s", + "outputId": "f0d96dce-0128-4080-bfe2-0ee6fbc0ad90" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[\u003ctf.Variable 'dense_1/kernel:0' shape=(5, 10) dtype=float32, numpy=\n", + " array([[ 0.43788117, -0.62099844, -0.30525017, -0.59352523, 0.1783089 ,\n", + " 0.47078604, -0.23620895, -0.30482283, 0.01366901, -0.1288507 ],\n", + " [ 0.18407935, -0.56550485, 0.54180616, -0.42254075, 0.3702994 ,\n", + " 0.36705834, -0.29678228, 0.36660975, 0.36717761, 0.46269661],\n", + " [ 0.1709305 , -0.11529458, 0.32710236, 0.46300393, -0.62802851,\n", + " 0.51641601, 0.39624029, 0.26918125, -0.25196898, 0.21353298],\n", + " [ 0.35752094, 0.44161648, 0.61500639, -0.12653333, 0.41629118,\n", + " 0.36193585, 0.066082 , -0.59253877, 0.47318751, 0.17115968],\n", + " [-0.22554061, -0.17727301, 0.5525015 , 0.3678053 , -0.00454676,\n", + " 0.24066836, -0.53640735, 0.13792562, -0.10727292, 0.59708995]], dtype=float32)\u003e,\n", + " \u003ctf.Variable 'dense_1/bias:0' shape=(10,) dtype=float32, numpy=array([ 0., 0., 0., 0., 0., 0., 0., 0., 
0., 0.], dtype=float32)\u003e]" + ] + }, + "execution_count": 4, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# Layers have many useful methods. For example, you can inspect all variables\n", + "# in a layer by calling layer.variables. In this case a fully-connected layer\n", + "# will have variables for weights and biases.\n", + "layer.variables" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "height": 221 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 226, + "status": "ok", + "timestamp": 1527783643252, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "6ilvKjz8_4MQ", + "outputId": "f647fced-c2d7-41a3-c237-242036784665" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "(\u003ctf.Variable 'dense_1/kernel:0' shape=(5, 10) dtype=float32, numpy=\n", + " array([[ 0.43788117, -0.62099844, -0.30525017, -0.59352523, 0.1783089 ,\n", + " 0.47078604, -0.23620895, -0.30482283, 0.01366901, -0.1288507 ],\n", + " [ 0.18407935, -0.56550485, 0.54180616, -0.42254075, 0.3702994 ,\n", + " 0.36705834, -0.29678228, 0.36660975, 0.36717761, 0.46269661],\n", + " [ 0.1709305 , -0.11529458, 0.32710236, 0.46300393, -0.62802851,\n", + " 0.51641601, 0.39624029, 0.26918125, -0.25196898, 0.21353298],\n", + " [ 0.35752094, 0.44161648, 0.61500639, -0.12653333, 0.41629118,\n", + " 0.36193585, 0.066082 , -0.59253877, 0.47318751, 0.17115968],\n", + " [-0.22554061, -0.17727301, 0.5525015 , 0.3678053 , -0.00454676,\n", + " 0.24066836, -0.53640735, 0.13792562, -0.10727292, 0.59708995]], dtype=float32)\u003e,\n", + " \u003ctf.Variable 'dense_1/bias:0' shape=(10,) dtype=float32, numpy=array([ 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)\u003e)" + ] + }, + "execution_count": 5, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "# The variables are also accessible through nice accessors\n", + "layer.kernel, layer.bias" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "O0kDbE54-5VS" + }, + "source": [ + "## Implementing custom layers\n", + "The best way to implement your own layer is extending the tf.keras.Layer class and implementing:\n", + " * `__init__` , where you can do all input-independent initialization\n", + " * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n", + " * `call`, where you do the forward computation\n", + "\n", + "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes requires to create the variables will need to be explicitly specified." 
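As a side-by-side illustration of the trade-off described just above (creating a layer's variables in `__init__` versus deferring them to `build`), here is a minimal sketch. It is not part of the notebook being added by this patch: the class names are invented, and it assumes the same eager-execution setup and `add_variable` API used throughout these example notebooks.

```python
import tensorflow as tf

tf.enable_eager_execution()


class EagerKernelDense(tf.keras.layers.Layer):
  """Creates its kernel in __init__, so the input size must be given up front."""

  def __init__(self, num_inputs, num_outputs):
    super(EagerKernelDense, self).__init__()
    self.kernel = self.add_variable("kernel", shape=[num_inputs, num_outputs])

  def call(self, inputs):
    return tf.matmul(inputs, self.kernel)


class LazyKernelDense(tf.keras.layers.Layer):
  """Defers kernel creation to build, so the input size is inferred on first use."""

  def __init__(self, num_outputs):
    super(LazyKernelDense, self).__init__()
    self.num_outputs = num_outputs

  def build(self, input_shape):
    # input_shape is only known here, at the first call.
    self.kernel = self.add_variable(
        "kernel", shape=[input_shape[-1].value, self.num_outputs])

  def call(self, inputs):
    return tf.matmul(inputs, self.kernel)


eager_layer = EagerKernelDense(5, 10)   # input size fixed at construction time
lazy_layer = LazyKernelDense(10)        # input size discovered from the data
print(eager_layer(tf.zeros([2, 5])).shape)  # (2, 10)
print(lazy_layer(tf.zeros([2, 5])).shape)   # (2, 10); build() ran automatically
```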
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "height": 391 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 251, + "status": "ok", + "timestamp": 1527783661512, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "5Byl3n1k5kIy", + "outputId": "6e7f9285-649a-4132-82ce-73ea92f15862" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor(\n", + "[[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]\n", + " [ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(10, 10), dtype=float32)\n", + "[\u003ctf.Variable 'my_dense_layer_1/kernel:0' shape=(5, 10) dtype=float32, numpy=\n", + "array([[-0.4011991 , 0.22458655, -0.33237562, -0.25117266, 0.33528614,\n", + " -0.01392961, 0.58580834, -0.16346583, 0.28465688, -0.47191954],\n", + " [-0.52922136, 0.22416979, -0.58209574, -0.60914612, 0.05226624,\n", + " -0.18325993, 0.5591442 , -0.24718609, 0.37148207, 0.40475875],\n", + " [ 0.16912812, -0.47618777, -0.38989353, 0.30105609, -0.08085585,\n", + " 0.44758242, 0.545829 , 0.51421839, 0.11063248, 0.20159996],\n", + " [ 0.34073615, -0.59835428, 0.06498981, -0.44489855, -0.34302285,\n", + " 0.20969599, 0.35527444, -0.03173476, -0.22227573, 0.09303057],\n", + " [ 0.41764337, -0.06435019, -0.52509922, -0.39957345, 0.56811184,\n", + " 0.23481232, -0.61666459, 0.31144124, -0.11532354, -0.42421889]], dtype=float32)\u003e]\n" + ] + } + ], + "source": [ + "class MyDenseLayer(tf.keras.layers.Layer):\n", + " def __init__(self, num_outputs):\n", + " super(MyDenseLayer, self).__init__()\n", + " self.num_outputs = num_outputs\n", + " \n", + " def build(self, input_shape):\n", + " self.kernel = self.add_variable(\"kernel\", \n", + " shape=[input_shape[-1].value, \n", + " self.num_outputs])\n", + " \n", + " def call(self, input):\n", + " return tf.matmul(input, self.kernel)\n", + " \n", + "layer = MyDenseLayer(10)\n", + "print(layer(tf.zeros([10, 5])))\n", + "print(layer.variables)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "tk8E2vY0-z4Z" + }, + "source": [ + "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`.\n", + "\n", + "Overall code is easier to read and maintain if it uses standard layers whenever possible, as other readers will be familiar with the behavior of standard layers. If you want to use a layer which is not present in tf.keras.layers or tf.contrib.layers, consider filing a [github issue](http://github.com/tensorflow/tensorflow/issues/new) or, even better, sending us a pull request!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "Qhg4KlbKrs3G" + }, + "source": [ + "## Models: composing layers\n", + "\n", + "Many interesting layer-like things in machine learning models are implemented by composing existing layers. For example, each residual block in a resnet is a composition of convolutions, batch normalizations, and a shortcut.\n", + "\n", + "The main class used when creating a layer-like thing which contains other layers is tf.keras.Model. 
Implementing one is done by inheriting from tf.keras.Model." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "height": 190 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 420, + "status": "ok", + "timestamp": 1527783698512, + "user": { + "displayName": "", + "photoUrl": "", + "userId": "" + }, + "user_tz": 420 + }, + "id": "N30DTXiRASlb", + "outputId": "a8b23a8e-5cf9-4bbf-f93b-6c763d74e2b3" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor(\n", + "[[[[ 0. 0. 0.]\n", + " [ 0. 0. 0.]\n", + " [ 0. 0. 0.]]\n", + "\n", + " [[ 0. 0. 0.]\n", + " [ 0. 0. 0.]\n", + " [ 0. 0. 0.]]]], shape=(1, 2, 3, 3), dtype=float32)\n", + "['resnet_identity_block_1/conv2d_3/kernel:0', 'resnet_identity_block_1/conv2d_3/bias:0', 'resnet_identity_block_1/batch_normalization_3/gamma:0', 'resnet_identity_block_1/batch_normalization_3/beta:0', 'resnet_identity_block_1/conv2d_4/kernel:0', 'resnet_identity_block_1/conv2d_4/bias:0', 'resnet_identity_block_1/batch_normalization_4/gamma:0', 'resnet_identity_block_1/batch_normalization_4/beta:0', 'resnet_identity_block_1/conv2d_5/kernel:0', 'resnet_identity_block_1/conv2d_5/bias:0', 'resnet_identity_block_1/batch_normalization_5/gamma:0', 'resnet_identity_block_1/batch_normalization_5/beta:0', 'resnet_identity_block_1/batch_normalization_3/moving_mean:0', 'resnet_identity_block_1/batch_normalization_3/moving_variance:0', 'resnet_identity_block_1/batch_normalization_4/moving_mean:0', 'resnet_identity_block_1/batch_normalization_4/moving_variance:0', 'resnet_identity_block_1/batch_normalization_5/moving_mean:0', 'resnet_identity_block_1/batch_normalization_5/moving_variance:0']\n" + ] + } + ], + "source": [ + "class ResnetIdentityBlock(tf.keras.Model):\n", + " def __init__(self, kernel_size, filters):\n", + " super(ResnetIdentityBlock, self).__init__(name='')\n", + " filters1, filters2, filters3 = filters\n", + "\n", + " self.conv2a = tf.keras.layers.Conv2D(filters1, (1, 1))\n", + " self.bn2a = tf.keras.layers.BatchNormalization()\n", + "\n", + " self.conv2b = tf.keras.layers.Conv2D(filters2, kernel_size, padding='same')\n", + " self.bn2b = tf.keras.layers.BatchNormalization()\n", + "\n", + " self.conv2c = tf.keras.layers.Conv2D(filters3, (1, 1))\n", + " self.bn2c = tf.keras.layers.BatchNormalization()\n", + "\n", + " def call(self, input_tensor, training=False):\n", + " x = self.conv2a(input_tensor)\n", + " x = self.bn2a(x, training=training)\n", + " x = tf.nn.relu(x)\n", + "\n", + " x = self.conv2b(x)\n", + " x = self.bn2b(x, training=training)\n", + " x = tf.nn.relu(x)\n", + "\n", + " x = self.conv2c(x)\n", + " x = self.bn2c(x, training=training)\n", + "\n", + " x += input_tensor\n", + " return tf.nn.relu(x)\n", + "\n", + " \n", + "block = ResnetIdentityBlock(1, [1, 2, 3])\n", + "print(block(tf.zeros([1, 2, 3, 3])))\n", + "print([x.name for x in block.variables])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "wYfucVw65PMj" + }, + "source": [ + "Much of the time, however, models which compose many layers simply call one layer after the other. 
This can be done in very little code using tf.keras.Sequential" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab": { + "autoexec": { + "startup": false, + "wait_interval": 0 + }, + "base_uri": "https://localhost:8080/", + "height": 153 + }, + "colab_type": "code", + "executionInfo": { + "elapsed": 361, + "status": "ok", + "timestamp": 1526674830777, + "user": { + "displayName": "Alexandre Passos", + "photoUrl": "//lh4.googleusercontent.com/-kmTTWXEgAPw/AAAAAAAAAAI/AAAAAAAAAC0/q_DoOzKGwds/s50-c-k-no/photo.jpg", + "userId": "108023195365833072773" + }, + "user_tz": 420 + }, + "id": "L9frk7Ur4uvJ", + "outputId": "882e9076-b6d9-4380-bb1e-7c6b57d54c39" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "\u003ctf.Tensor: id=1423, shape=(1, 2, 3, 3), dtype=float32, numpy=\n", + "array([[[[0., 0., 0.],\n", + " [0., 0., 0.],\n", + " [0., 0., 0.]],\n", + "\n", + " [[0., 0., 0.],\n", + " [0., 0., 0.],\n", + " [0., 0., 0.]]]], dtype=float32)\u003e" + ] + }, + "execution_count": 26, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + " my_seq = tf.keras.Sequential([tf.keras.layers.Conv2D(1, (1, 1)),\n", + " tf.keras.layers.BatchNormalization(),\n", + " tf.keras.layers.Conv2D(2, 1, \n", + " padding='same'),\n", + " tf.keras.layers.BatchNormalization(),\n", + " tf.keras.layers.Conv2D(3, (1, 1)),\n", + " tf.keras.layers.BatchNormalization()])\n", + "my_seq(tf.zeros([1, 2, 3, 3]))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text", + "id": "c5YwYcnuK-wc" + }, + "source": [ + "# Next steps\n", + "\n", + "Now you can go back to the previous notebook and adapt the linear regression example to use layers and models to be better structured." + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "default_view": {}, + "name": "4 - High level API - TensorFlow Eager.ipynb", + "provenance": [], + "version": "0.3.2", + "views": {} + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py index 8517a3bf7b6aeb..b14ef1df8ff4c6 100644 --- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py +++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_test.py @@ -36,9 +36,7 @@ def device_and_data_format(): 'channels_last') -def random_batch(batch_size, device_and_format=None): - _, data_format = device_and_format or device_and_data_format() - +def random_batch(batch_size, data_format): shape = (3, 224, 224) if data_format == 'channels_first' else (224, 224, 3) shape = (batch_size,) + shape @@ -51,15 +49,17 @@ def random_batch(batch_size, device_and_format=None): return images, one_hot -def train_one_step(model, images, labels, optimizer): - - with tfe.GradientTape() as tape: +def compute_gradients(model, images, labels): + with tf.GradientTape() as tape: logits = model(images, training=True) loss = tf.losses.softmax_cross_entropy( logits=logits, onehot_labels=labels) tf.contrib.summary.scalar(name='loss', tensor=loss) - grads = tape.gradient(loss, model.variables) - optimizer.apply_gradients(zip(grads, model.variables)) + return tape.gradient(loss, model.variables) + + +def apply_gradients(model, optimizer, gradients): + optimizer.apply_gradients(zip(gradients, model.variables)) class ResNet50Test(tf.test.TestCase): @@ -70,7 +70,7 @@ def _apply(self, defun=False, 
execution_mode=None): if defun: model.call = tfe.defun(model.call) with tf.device(device), tfe.execution_mode(execution_mode): - images, _ = random_batch(2) + images, _ = random_batch(2, data_format) output = model(images, training=False) tfe.async_wait() self.assertEqual((2, 1000), output.shape) @@ -91,7 +91,7 @@ def test_apply_no_top(self): device, data_format = device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False) with tf.device(device): - images, _ = random_batch(2) + images, _ = random_batch(2, data_format) output = model(images, training=False) output_shape = ((2, 2048, 1, 1) if data_format == 'channels_first' else (2, 1, 1, 2048)) @@ -101,7 +101,7 @@ def test_apply_with_pooling(self): device, data_format = device_and_data_format() model = resnet50.ResNet50(data_format, include_top=False, pooling='avg') with tf.device(device): - images, _ = random_batch(2) + images, _ = random_batch(2, data_format) output = model(images, training=False) self.assertEqual((2, 2048), output.shape) @@ -115,8 +115,9 @@ def _test_train(self, execution_mode=None): name='t0').as_default(), tf.contrib.summary.always_record_summaries(): with tf.device(device), tfe.execution_mode(execution_mode): optimizer = tf.train.GradientDescentOptimizer(0.1) - images, labels = random_batch(2) - train_one_step(model, images, labels, optimizer) + images, labels = random_batch(2, data_format) + apply_gradients(model, optimizer, + compute_gradients(model, images, labels)) self.assertEqual(320, len(model.variables)) tfe.async_wait() events = summary_test_util.events_from_logdir(logdir) @@ -134,20 +135,22 @@ def test_no_garbage(self): model = resnet50.ResNet50(data_format) optimizer = tf.train.GradientDescentOptimizer(0.1) with tf.device(device): - images, labels = random_batch(2) + images, labels = random_batch(2, data_format) gc.disable() # Warm up. Note that this first run does create significant amounts of # garbage to be collected. The hope is that this is a build-only effect, # and a subsequent training loop will create nothing which needs to be # collected. - train_one_step(model, images, labels, optimizer) + apply_gradients(model, optimizer, + compute_gradients(model, images, labels)) gc.collect() previous_gc_debug_flags = gc.get_debug() gc.set_debug(gc.DEBUG_SAVEALL) for _ in range(2): # Run twice to ensure that garbage that is created on the first # iteration is no longer accessible. - train_one_step(model, images, labels, optimizer) + apply_gradients(model, optimizer, + compute_gradients(model, images, labels)) gc.collect() # There should be no garbage requiring collection. self.assertEqual(0, len(gc.garbage)) @@ -182,9 +185,7 @@ def _train_batch_sizes(self): return (16, 32, 64) if tf.DeviceSpec.from_string(device.name).device_type == 'TPU': - # TODO(iga): Training fails with batch size of 16, probably because of - # no layout optimizations with op-by-op mode. Investigate more. - return (8,) + return (32,) return (16, 32) def _report(self, label, start, num_iters, device, batch_size, data_format): @@ -202,18 +203,18 @@ def _force_device_sync(self): # which forces a sync. This is a roundabout way, yes. 
tf.constant(1.).cpu() - def _benchmark_eager_apply(self, label, defun=False, execution_mode=None, - device_and_format=None): + def _benchmark_eager_apply(self, label, device_and_format, defun=False, + execution_mode=None, compiled=False): with tfe.execution_mode(execution_mode): - device, data_format = device_and_format or device_and_data_format() + device, data_format = device_and_format model = resnet50.ResNet50(data_format) if defun: - model.call = tfe.defun(model.call) + model.call = tfe.defun(model.call, compiled=compiled) batch_size = 64 num_burn = 5 num_iters = 30 with tf.device(device): - images, _ = random_batch(batch_size, device_and_format) + images, _ = random_batch(batch_size, data_format) for _ in xrange(num_burn): model(images, training=False).cpu() if execution_mode: @@ -227,37 +228,44 @@ def _benchmark_eager_apply(self, label, defun=False, execution_mode=None, self._report(label, start, num_iters, device, batch_size, data_format) def benchmark_eager_apply_sync(self): - self._benchmark_eager_apply('eager_apply', defun=False) + self._benchmark_eager_apply('eager_apply', device_and_data_format(), + defun=False) def benchmark_eager_apply_async(self): self._benchmark_eager_apply( - 'eager_apply_async', defun=False, execution_mode=tfe.ASYNC) + 'eager_apply_async', device_and_data_format(), defun=False, + execution_mode=tfe.ASYNC) def benchmark_eager_apply_with_defun(self): - self._benchmark_eager_apply('eager_apply_with_defun', defun=True) + self._benchmark_eager_apply('eager_apply_with_defun', + device_and_data_format(), defun=True) def _benchmark_eager_train(self, label, make_iterator, + device_and_format, defun=False, execution_mode=None, - device_and_format=None): + compiled=False): with tfe.execution_mode(execution_mode): - device, data_format = device_and_format or device_and_data_format() + device, data_format = device_and_format for batch_size in self._train_batch_sizes(): - (images, labels) = random_batch(batch_size, device_and_format) - num_burn = 3 - num_iters = 10 + (images, labels) = random_batch(batch_size, data_format) model = resnet50.ResNet50(data_format) - if defun: - model.call = tfe.defun(model.call) optimizer = tf.train.GradientDescentOptimizer(0.1) + apply_grads = apply_gradients + if defun: + model.call = tfe.defun(model.call, compiled=compiled) + apply_grads = tfe.defun(apply_gradients, compiled=compiled) + num_burn = 3 + num_iters = 10 with tf.device(device): iterator = make_iterator((images, labels)) for _ in xrange(num_burn): (images, labels) = iterator.next() - train_one_step(model, images, labels, optimizer) + apply_grads(model, optimizer, + compute_gradients(model, images, labels)) if execution_mode: tfe.async_wait() self._force_device_sync() @@ -266,25 +274,29 @@ def _benchmark_eager_train(self, start = time.time() for _ in xrange(num_iters): (images, labels) = iterator.next() - train_one_step(model, images, labels, optimizer) + apply_grads(model, optimizer, + compute_gradients(model, images, labels)) if execution_mode: tfe.async_wait() self._force_device_sync() self._report(label, start, num_iters, device, batch_size, data_format) def benchmark_eager_train_sync(self): - self._benchmark_eager_train('eager_train', MockIterator, defun=False) + self._benchmark_eager_train('eager_train', MockIterator, + device_and_data_format(), defun=False) def benchmark_eager_train_async(self): self._benchmark_eager_train( 'eager_train_async', MockIterator, + device_and_data_format(), defun=False, execution_mode=tfe.ASYNC) def benchmark_eager_train_with_defun(self): 
self._benchmark_eager_train( - 'eager_train_with_defun', MockIterator, defun=True) + 'eager_train_with_defun', MockIterator, + device_and_data_format(), defun=True) def benchmark_eager_train_datasets(self): @@ -294,7 +306,8 @@ def make_iterator(tensors): return tfe.Iterator(ds) self._benchmark_eager_train( - 'eager_train_dataset', make_iterator, defun=False) + 'eager_train_dataset', make_iterator, + device_and_data_format(), defun=False) def benchmark_eager_train_datasets_with_defun(self): @@ -304,7 +317,8 @@ def make_iterator(tensors): return tfe.Iterator(ds) self._benchmark_eager_train( - 'eager_train_dataset_with_defun', make_iterator, defun=True) + 'eager_train_dataset_with_defun', make_iterator, + device_and_data_format(), defun=True) if __name__ == '__main__': diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py index 492adbe1d80941..5ee2176154ec70 100644 --- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py +++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot.py @@ -152,7 +152,7 @@ def __init__(self, rnn_cell_sizes, label_dimension, keep_prob): self.label_dimension = label_dimension self.keep_prob = keep_prob - self.cells = self._add_cells( + self.cells = tf.contrib.checkpoint.List( [tf.nn.rnn_cell.BasicLSTMCell(size) for size in rnn_cell_sizes]) self.relu = layers.Dense( label_dimension, activation=tf.nn.relu, name="relu") @@ -204,14 +204,6 @@ def call(self, inputs, training=False): hidden_states = tf.gather_nd(chars, indices) return self.relu(hidden_states) - def _add_cells(self, cells): - # "Magic" required for keras.Model classes to track all the variables in - # a list of layers.Layer objects. - # TODO(ashankar): Figure out API so user code doesn't have to do this. - for i, c in enumerate(cells): - setattr(self, "cell-%d" % i, c) - return cells - def loss(labels, predictions): """Computes mean squared loss.""" diff --git a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py index 75b342ba78bd5d..b7d8395e277b52 100644 --- a/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py +++ b/tensorflow/contrib/eager/python/examples/rnn_colorbot/rnn_colorbot_test.py @@ -67,5 +67,5 @@ def testTest(self): if __name__ == "__main__": - tfe.enable_eager_execution() + tf.enable_eager_execution() tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py index be5d60449d7e08..c2340a293a8092 100644 --- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py +++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb.py @@ -50,7 +50,7 @@ class RNN(tf.keras.Model): def __init__(self, hidden_dim, num_layers, keep_ratio): super(RNN, self).__init__() self.keep_ratio = keep_ratio - self.cells = self._add_cells([ + self.cells = tf.contrib.checkpoint.List([ tf.nn.rnn_cell.BasicLSTMCell(num_units=hidden_dim) for _ in range(num_layers) ]) @@ -74,14 +74,6 @@ def call(self, input_seq, training): # tuple (output, output_states). return [input_seq] - def _add_cells(self, cells): - # "Magic" required for keras.Model classes to track all the variables in - # a list of Layer objects. - # TODO(ashankar): Figure out API so user code doesn't have to do this. 
- for i, c in enumerate(cells): - setattr(self, "cell-%d" % i, c) - return cells - class Embedding(layers.Layer): """An Embedding layer.""" @@ -304,7 +296,7 @@ def test_model(use_cudnn_rnn): def main(_): - tfe.enable_eager_execution() + tf.enable_eager_execution() if not FLAGS.data_path: raise ValueError("Must specify --data-path") diff --git a/tensorflow/contrib/eager/python/examples/scan/BUILD b/tensorflow/contrib/eager/python/examples/scan/BUILD new file mode 100644 index 00000000000000..69660fbeabf2f9 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/scan/BUILD @@ -0,0 +1,25 @@ +licenses(["notice"]) # Apache 2.0 + +package(default_visibility = ["//tensorflow:internal"]) + +load("//tensorflow:tensorflow.bzl", "gpu_py_test") + +gpu_py_test( + name = "scan_test", + size = "small", + srcs = ["scan_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow:tensorflow_py", + ], +) + +gpu_py_test( + name = "scan_graph_test", + size = "small", + srcs = ["scan_graph_test.py"], + additional_deps = [ + "//third_party/py/numpy", + "//tensorflow:tensorflow_py", + ], +) diff --git a/tensorflow/contrib/eager/python/examples/scan/scan_graph_test.py b/tensorflow/contrib/eager/python/examples/scan/scan_graph_test.py new file mode 100644 index 00000000000000..d4b8c8941ec411 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/scan/scan_graph_test.py @@ -0,0 +1,54 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Unit test for tf.scan under graph mode execution.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np +import tensorflow as tf + + +class ScanBenchmark(tf.test.Benchmark): + + def runScan(self, n): + elems = np.arange(n) + start_time = time.time() + sum_op = tf.scan(lambda a, x: a + x, elems, parallel_iterations=1) + with tf.Session() as sess: + sess.run(sum_op) + wall_time = time.time() - start_time + + self.report_benchmark( + name='scan', + iters=n, + wall_time=wall_time) + + def benchmarkScan16000(self): + self.runScan(16000) + + def benchmarkScan32000(self): + self.runScan(32000) + + def benchmarkScan64000(self): + self.runScan(64000) + + def benchmarkScan128000(self): + self.runScan(128000) + +if __name__ == '__main__': + tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/scan/scan_test.py b/tensorflow/contrib/eager/python/examples/scan/scan_test.py new file mode 100644 index 00000000000000..a02fc24c79dae6 --- /dev/null +++ b/tensorflow/contrib/eager/python/examples/scan/scan_test.py @@ -0,0 +1,54 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Unit test for tf.scan under eager execution.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import time + +import numpy as np +import tensorflow as tf + + +class ScanBenchmark(tf.test.Benchmark): + + def runScan(self, n): + elems = np.arange(n) + start_time = time.time() + _ = tf.scan(lambda a, x: a + x, elems, parallel_iterations=1) + wall_time = time.time() - start_time + + self.report_benchmark( + name='scan', + iters=n, + wall_time=wall_time) + + def benchmarkScan16000(self): + self.runScan(16000) + + def benchmarkScan32000(self): + self.runScan(32000) + + def benchmarkScan64000(self): + self.runScan(64000) + + def benchmarkScan128000(self): + self.runScan(128000) + + +if __name__ == '__main__': + tf.enable_eager_execution() + tf.test.main() diff --git a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py index f825a2a7363fbe..8ac553e0ae7138 100644 --- a/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py +++ b/tensorflow/contrib/eager/python/examples/spinn/spinn_test.py @@ -34,10 +34,10 @@ from tensorflow.contrib.eager.python.examples.spinn import data from third_party.examples.eager.spinn import spinn from tensorflow.contrib.summary import summary_test_util -from tensorflow.core.protobuf import checkpointable_object_graph_pb2 from tensorflow.python.eager import test from tensorflow.python.framework import test_util -from tensorflow.python.training import checkpoint_utils +from tensorflow.python.training import saver +from tensorflow.python.training.checkpointable import util as checkpointable_utils # pylint: enable=g-bad-import-order @@ -421,10 +421,8 @@ def testTrainSpinn(self): # 5. Verify that checkpoints exist and contains all the expected variables. 
self.assertTrue(glob.glob(os.path.join(config.logdir, "ckpt*"))) - object_graph_string = checkpoint_utils.load_variable( - config.logdir, name="_CHECKPOINTABLE_OBJECT_GRAPH") - object_graph = checkpointable_object_graph_pb2.CheckpointableObjectGraph() - object_graph.ParseFromString(object_graph_string) + object_graph = checkpointable_utils.object_metadata( + saver.latest_checkpoint(config.logdir)) ckpt_variable_names = set() for node in object_graph.nodes: for attribute in node.attributes: diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py index 907f9204c2d31a..c947ed9dcc4156 100644 --- a/tensorflow/contrib/eager/python/metrics_impl.py +++ b/tensorflow/contrib/eager/python/metrics_impl.py @@ -25,12 +25,13 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops +from tensorflow.python.ops import check_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import init_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import summary_ops_v2 as summary_ops from tensorflow.python.ops import variable_scope -from tensorflow.python.training import checkpointable +from tensorflow.python.training.checkpointable import base as checkpointable _to_replace = re.compile("[^A-Za-z0-9.]") @@ -367,6 +368,9 @@ def call(self, labels, predictions, weights=None): Returns: The arguments, for easy chaining. """ + check_ops.assert_equal( + array_ops.shape(labels), array_ops.shape(predictions), + message="Shapes of labels and predictions are unequal") matches = math_ops.equal(labels, predictions) matches = math_ops.cast(matches, dtypes.float64) super(Accuracy, self).call(matches, weights=weights) diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py index f0fe4ce8c53bb8..02ee05487515b8 100644 --- a/tensorflow/contrib/eager/python/metrics_test.py +++ b/tensorflow/contrib/eager/python/metrics_test.py @@ -26,12 +26,13 @@ from tensorflow.python.eager import context from tensorflow.python.eager import test from tensorflow.python.framework import dtypes +from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import summary_ops_v2 as summary_ops -from tensorflow.python.training import checkpointable_utils from tensorflow.python.training import training_util +from tensorflow.python.training.checkpointable import util as checkpointable_utils class MetricsTest(test.TestCase): @@ -117,6 +118,11 @@ def testAccuracy(self): self.assertEqual(dtypes.float64, m.dtype) self.assertEqual(dtypes.float64, m.result().dtype) + def testAccuracyDifferentShapes(self): + m = metrics.Accuracy() + with self.assertRaises(errors.InvalidArgumentError): + m([[0], [0]], [0, 1]) + def testWeightedAccuracy(self): m = metrics.Accuracy() # 1 correct, total weight of 2 @@ -146,8 +152,6 @@ def testTwoMeans(self): self.assertAllEqual(2.0, m2.result()) def testNamesWithSpaces(self): - # Verify two metrics with the same class and name don't - # accidentally share state. m1 = metrics.Mean("has space") m1(0) self.assertEqual(m1.name, "has space") @@ -186,8 +190,8 @@ def testGraphAndEagerTensor(self): self.assertEqual(self.evaluate(value), 2.5) def testTwoMeansGraph(self): - # Verify two metrics with the same class and name don't - # accidentally share state. 
+ # Verify two metrics with the same name in the same graph raises a + # ValueError. with context.graph_mode(): m1 = metrics.Mean() m1(0) diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py index 2f8721324f5fc1..f801d9a47b2f83 100644 --- a/tensorflow/contrib/eager/python/network.py +++ b/tensorflow/contrib/eager/python/network.py @@ -23,14 +23,16 @@ import weakref from tensorflow.python.eager import context -from tensorflow.python.estimator import util as estimator_util from tensorflow.python.framework import ops -from tensorflow.python.keras._impl.keras.engine import base_layer as keras_base_layer +from tensorflow.python.keras.engine import base_layer as keras_base_layer from tensorflow.python.layers import base from tensorflow.python.ops import variable_scope +from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import checkpoint_utils from tensorflow.python.training import saver as saver_lib from tensorflow.python.training import training_util +from tensorflow.python.util import deprecation +from tensorflow.python.util import function_utils # pylint: disable=protected-access # Explanation for protected-access disable: Network has lots of same-class and @@ -52,9 +54,40 @@ def _network_name_scope_naming(current_variable_scope): return current_variable_scope.name + "/" +_NETWORK_DEPRECATION_MESSAGE = ( + "Please inherit from `tf.keras.Model`, and see its documentation for " + "details. `tf.keras.Model` should be a drop-in replacement for " + "`tfe.Network` in most cases, but note that `track_layer` is no longer " + "necessary or supported. Instead, `Layer` instances are tracked on " + "attribute assignment (see the section of `tf.keras.Model`'s documentation " + "on subclassing). Since the output of `track_layer` is often assigned to " + "an attribute anyway, most code can be ported by simply removing the " + "`track_layer` calls.\n\n`tf.keras.Model` works with all TensorFlow " + "`Layer` instances, including those from `tf.layers`, but switching to " + "the `tf.keras.layers` versions along with the migration to " + "`tf.keras.Model` is recommended, since it will preserve variable names. " + "Feel free to import it with an alias to avoid excess typing :)." +) + + class Network(base.Layer): """Represents the composition of a set of Layers. + *Deprecated*. Please inherit from `tf.keras.Model`, and see its documentation + for details. `tf.keras.Model` should be a drop-in replacement for + `tfe.Network` in most cases, but note that `track_layer` is no longer + necessary or supported. Instead, `Layer` instances are tracked on attribute + assignment (see the section of `tf.keras.Model`'s documentation on + subclassing). Since the output of `track_layer` is often assigned to an + attribute anyway, most code can be ported by simply removing the `track_layer` + calls. + + `tf.keras.Model` works with all TensorFlow `Layer` instances, including those + from `tf.layers`, but switching to the `tf.keras.layers` versions along with + the migration to `tf.keras.Model` is recommended, since it will preserve + variable names. Feel free to import it with an alias to avoid excess typing + :). + `Network` implements the `Layer` interface and adds convenience methods for managing sub-`Layer`s, such as listing variables. @@ -112,6 +145,7 @@ def call(self, inputs): # - Detect layers used in __call__ that weren't registered with track_layer. # - Convert inputs to __call__ to tensors. 
+ @deprecation.deprecated(date=None, instructions=_NETWORK_DEPRECATION_MESSAGE) def __init__(self, name=None): """Configure the `Network`. @@ -130,6 +164,10 @@ def __init__(self, name=None): ValueError: If `name` is not valid. Note that some naming errors will instead be raised when the `Network` is called. """ + if context.executing_eagerly(): + logging.warning( + ("** tfe.Network is deprecated and will be removed in a future " + "version.\n\n%s") % _NETWORK_DEPRECATION_MESSAGE) if isinstance(name, variable_scope.VariableScope): raise ValueError("VariableScopes are not valid Network names.") if name is not None and "/" in name: @@ -152,6 +190,11 @@ def __init__(self, name=None): self._variable_scope_counts_on_init = ( variable_scope.get_variable_scope_store().variable_scopes_count) + def _gather_saveables_for_checkpoint(self): + raise NotImplementedError( + "tfe.Network does not support object-based checkpointing.\n\n%s" + % _NETWORK_DEPRECATION_MESSAGE) + def _name_scope_name(self, current_variable_scope): """Overrides Layer op naming to match variable naming.""" return _network_name_scope_naming( @@ -502,10 +545,10 @@ def __init__(self, layers_funcs=None, name=None): def add(self, layer_func): if isinstance(layer_func, base.Layer): - args = estimator_util.fn_args(layer_func.call) + args = function_utils.fn_args(layer_func.call) self.track_layer(layer_func) elif callable(layer_func): - args = estimator_util.fn_args(layer_func) + args = function_utils.fn_args(layer_func) else: raise TypeError( "Sequential.add() takes only tf.layers.Layer objects or callables; " @@ -706,6 +749,9 @@ def _strip_variable_prefix(original_variable_name): return _strip_variable_prefix +@deprecation.deprecated(date=None, instructions=( + "Please inherit from tf.keras.Model instead of tfe.Network, and use " + "tf.keras.Model.save_weights.")) def save_network_checkpoint( network, save_path, global_step=None, map_func=None): """Save variables from the Network to a checkpoint. @@ -905,6 +951,9 @@ def _set_restore_on_create(network, save_path, map_func, user_map_func, _add_deferred_restoration(network, deferred_restoration) +@deprecation.deprecated(date=None, instructions=( + "Please inherit from tf.keras.Model instead of tfe.Network, and use " + "tf.keras.Model.load_weights.")) def restore_network_checkpoint(network, save_path, map_func=None): """Restore the Network from a checkpoint. 
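The deprecation notice added above describes the `tfe.Network` to `tf.keras.Model` port only in prose. Below is a minimal sketch of that port, assuming a TensorFlow build from roughly this era where `tf.keras.Model` tracks `Layer` attributes on assignment; the class and layer names (`TwoLayerNet`, `l1`, `l2`) are illustrative and not taken from the patch.

```python
import tensorflow as tf

# Before (deprecated): sub-layers had to be registered explicitly.
#
#   class TwoLayerNet(tfe.Network):
#     def __init__(self):
#       super(TwoLayerNet, self).__init__()
#       self.l1 = self.track_layer(tf.layers.Dense(10, activation=tf.nn.relu))
#       self.l2 = self.track_layer(tf.layers.Dense(2))

# After: tf.keras.Model tracks Layer attributes automatically, so the
# track_layer calls are simply removed. tf.keras.layers versions are used
# so variable names are preserved, as the deprecation message recommends.
class TwoLayerNet(tf.keras.Model):

  def __init__(self):
    super(TwoLayerNet, self).__init__()
    self.l1 = tf.keras.layers.Dense(10, activation=tf.nn.relu)
    self.l2 = tf.keras.layers.Dense(2)

  def call(self, inputs):
    return self.l2(self.l1(inputs))
```

Checkpointing the ported model then goes through `tf.keras.Model.save_weights` and `load_weights`, which is what the deprecation messages on `save_network_checkpoint` and `restore_network_checkpoint` point to.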
diff --git a/tensorflow/contrib/eager/python/network_test.py b/tensorflow/contrib/eager/python/network_test.py index f43376d5d777a7..c92bd15b253b67 100644 --- a/tensorflow/contrib/eager/python/network_test.py +++ b/tensorflow/contrib/eager/python/network_test.py @@ -31,6 +31,7 @@ from tensorflow.python.ops import resource_variable_ops from tensorflow.python.ops import variable_scope from tensorflow.python.training import training_util +from tensorflow.python.training.checkpointable import util as checkpointable_utils # pylint: disable=not-callable @@ -62,6 +63,12 @@ def call(self, values): class NetworkTest(test.TestCase): + def test_checkpointing_not_implemented(self): + checkpoint_directory = self.get_temp_dir() + checkpoint = checkpointable_utils.Checkpoint(net=MyNetwork()) + with self.assertRaises(NotImplementedError): + checkpoint.save(checkpoint_directory) + def _save_modify_load_network_built(self, net, global_step=None): checkpoint_directory = self.get_temp_dir() checkpoint_path = network.save_network_checkpoint( diff --git a/tensorflow/contrib/eager/python/saver_test.py b/tensorflow/contrib/eager/python/saver_test.py index 4032e755f6e7de..90a3711475719a 100644 --- a/tensorflow/contrib/eager/python/saver_test.py +++ b/tensorflow/contrib/eager/python/saver_test.py @@ -60,15 +60,9 @@ def model(): def testSameNameNoClobbering(self): with ops.device(self._dev()): - # Note that this test purposefully uses Graphs rather than - # IsolateTest. Users are more likely to accidentally create the same - # variable name this way. - first_graph = ops.Graph() - with first_graph.as_default(): - v1_first_graph = resource_variable_ops.ResourceVariable(1.0, name='v1') - with ops.Graph().as_default(): - v1_second_graph = resource_variable_ops.ResourceVariable(2.0, name='v1') - saver = _saver.Saver([v1_first_graph, v1_second_graph]) + v1 = resource_variable_ops.ResourceVariable(1.0, name='v1') + v2 = resource_variable_ops.ResourceVariable(2.0, name='v1') + saver = _saver.Saver([v1, v2]) ckpt_prefix = os.path.join(test.get_temp_dir(), 'ckpt') with self.assertRaisesRegexp(ValueError, 'v1'): saver.save(ckpt_prefix) @@ -126,12 +120,11 @@ def model(init_val): saver = _saver.Saver([v1]) saver.save(ckpt_prefix) - with ops.Graph().as_default(): - saver = _saver.Saver([v1]) - with _saver.restore_variables_on_create(ckpt_prefix): - # Value is from checkpoint, but not from argument. - ret, _ = model(2.0) - self.assertEqual(ret.numpy(), 1.0) + saver = _saver.Saver([v1]) + with _saver.restore_variables_on_create(ckpt_prefix): + # Value is from checkpoint, but not from argument. 
+ ret, _ = model(2.0) + self.assertEqual(ret.numpy(), 1.0) def testRestoreNotFound(self): with ops.device(self._dev()): @@ -184,17 +177,17 @@ def model(x): 4, model(array_ops.constant(2, dtype=dtypes.float32)).numpy()) # reset the graph and reload on create, so that 1 + 2 = 3 - with ops.Graph().as_default(): - with _saver.restore_variables_on_create(ckpt_prefix): - @graph_callable.graph_callable( - [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)]) - def model2(x): - v = variable_scope.get_variable( - 'v', initializer=init_ops.zeros_initializer(), shape=()) - return v + x - - self.assertEqual( - 3, model2(array_ops.constant(2, dtype=dtypes.float32)).numpy()) + ops.reset_default_graph() + with _saver.restore_variables_on_create(ckpt_prefix): + @graph_callable.graph_callable( + [graph_callable.ShapeAndDtype(shape=(), dtype=dtypes.float32)]) + def model2(x): + v = variable_scope.get_variable( + 'v', initializer=init_ops.zeros_initializer(), shape=()) + return v + x + + self.assertEqual( + 3, model2(array_ops.constant(2, dtype=dtypes.float32)).numpy()) class GetOptimizerTests(test.TestCase): diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py index 79dd117854e5fe..fee9db46fa4f79 100644 --- a/tensorflow/contrib/eager/python/tfe.py +++ b/tensorflow/contrib/eager/python/tfe.py @@ -115,14 +115,15 @@ from tensorflow.python.framework.ops import enable_eager_execution from tensorflow.python.framework.ops import eager_run as run from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes +from tensorflow.python.framework.test_util import run_all_in_graph_and_eager_modes as run_all_tests_in_graph_and_eager_modes from tensorflow.python.ops.custom_gradient import custom_gradient from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable from tensorflow.python.ops.variable_scope import EagerVariableStore from tensorflow.python.ops import script_ops from tensorflow.python.ops import template -from tensorflow.python.training.checkpointable import Checkpointable -from tensorflow.python.training.checkpointable_utils import CheckpointableSaver -from tensorflow.python.training.checkpointable_utils import Checkpoint +from tensorflow.python.training.checkpointable.base import Checkpointable +from tensorflow.python.training.checkpointable.util import CheckpointableSaver +from tensorflow.python.training.checkpointable.util import Checkpoint from tensorflow.python.util.all_util import remove_undocumented py_func = script_ops.eager_py_func diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py index e80ccbb74d8623..db50b33af2e4f1 100644 --- a/tensorflow/contrib/eager/python/tfe_test.py +++ b/tensorflow/contrib/eager/python/tfe_test.py @@ -57,7 +57,7 @@ def square(x): return math_ops.multiply(x, x) grad = tfe.gradients_function(square) - self.assertEquals([6], [x.numpy() for x in grad(3)]) + self.assertEquals([6], [x.numpy() for x in grad(3.)]) def testGradOfGrad(self): @@ -66,7 +66,7 @@ def square(x): grad = tfe.gradients_function(square) gradgrad = tfe.gradients_function(lambda x: grad(x)[0]) - self.assertEquals([2], [x.numpy() for x in gradgrad(3)]) + self.assertEquals([2], [x.numpy() for x in gradgrad(3.)]) def testCustomGrad(self): @@ -80,7 +80,7 @@ def grad_fn(_): return y, grad_fn grad = tfe.gradients_function(f) - self.assertEquals([12], [x.numpy() for x in grad(3)]) + self.assertEquals([12], [x.numpy() for x in grad(3.)]) def 
testGPU(self): if tfe.num_gpus() <= 0: diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD index cca441e9fb6996..ef9932f8355443 100644 --- a/tensorflow/contrib/estimator/BUILD +++ b/tensorflow/contrib/estimator/BUILD @@ -14,11 +14,14 @@ py_library( srcs = ["__init__.py"], srcs_version = "PY2AND3", deps = [ + ":baseline", ":boosted_trees", ":dnn", ":dnn_linear_combined", + ":export", ":extenders", ":head", + ":hooks", ":linear", ":logit_fns", ":multi_head", @@ -28,6 +31,49 @@ py_library( ], ) +py_library( + name = "baseline", + srcs = ["python/estimator/baseline.py"], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python/estimator", + "//tensorflow/python/estimator:baseline", + ], +) + +py_test( + name = "baseline_test", + size = "small", + srcs = ["python/estimator/baseline_test.py"], + srcs_version = "PY2AND3", + tags = [ + "no_pip", + "notsan", + ], + deps = [ + ":baseline", + ":head", + "//tensorflow/python:check_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:control_flow_ops", + "//tensorflow/python:dtypes", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python:platform", + "//tensorflow/python:session", + "//tensorflow/python:summary", + "//tensorflow/python:training", + "//tensorflow/python:variables", + "//tensorflow/python/estimator:export_export", + "//tensorflow/python/estimator:metric_keys", + "//tensorflow/python/estimator:numpy_io", + "//tensorflow/python/feature_column", + "//tensorflow/python/ops/losses", + "//third_party/py/numpy", + "@six_archive//:six", + ], +) + py_library( name = "boosted_trees", srcs = ["python/estimator/boosted_trees.py"], @@ -77,6 +123,7 @@ py_test( tags = [ "no_pip", "notsan", + "optonly", # times out http://b/79220679 ], deps = [ ":dnn", @@ -179,6 +226,43 @@ py_test( ], ) +py_library( + name = "export", + srcs = [ + "python/estimator/export.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python/estimator:model_fn", + ], +) + +py_test( + name = "export_test", + size = "medium", + srcs = ["python/estimator/export_test.py"], + srcs_version = "PY2AND3", + tags = ["notsan"], # b/62863147 + deps = [ + ":export", + "//tensorflow/python:array_ops", + "//tensorflow/python:client_testlib", + "//tensorflow/python:metrics", + "//tensorflow/python:parsing_ops", + "//tensorflow/python:session", + "//tensorflow/python:state_ops", + "//tensorflow/python:training", + "//tensorflow/python:util", + "//tensorflow/python:variables", + "//tensorflow/python/estimator", + "//tensorflow/python/estimator:export_export", + "//tensorflow/python/estimator:export_output", + "//tensorflow/python/estimator:model_fn", + "//tensorflow/python/saved_model:loader", + "//tensorflow/python/saved_model:tag_constants", + ], +) + py_library( name = "head", srcs = [ @@ -228,6 +312,7 @@ py_test( "//tensorflow/python:sparse_tensor", "//tensorflow/python:string_ops", "//tensorflow/python:training", + "//tensorflow/python:variables", "//tensorflow/python/estimator:metric_keys", "//tensorflow/python/estimator:model_fn", "//tensorflow/python/estimator:prediction_keys", @@ -238,6 +323,37 @@ py_test( ], ) +py_library( + name = "hooks", + srcs = [ + "python/estimator/hooks.py", + ], + srcs_version = "PY2AND3", + deps = [ + "//tensorflow/python:framework_ops", + "//tensorflow/python:training", + "//tensorflow/python/estimator:estimator_py", + ], +) + +py_test( + name = "hooks_test", + size = "medium", + srcs = ["python/estimator/hooks_test.py"], + srcs_version = "PY2AND3", + tags 
= ["notsan"], + deps = [ + ":hooks", + "//tensorflow/python:client_testlib", + "//tensorflow/python:framework_ops", + "//tensorflow/python:training", + "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/estimator:estimator_py", + "//third_party/py/numpy", + "@six_archive//:six", + ], +) + py_library( name = "linear", srcs = ["python/estimator/linear.py"], @@ -283,9 +399,9 @@ py_library( srcs_version = "PY2AND3", deps = [ "//tensorflow/python:framework_ops", + "//tensorflow/python:util", "//tensorflow/python/estimator:dnn", "//tensorflow/python/estimator:linear", - "//tensorflow/python/estimator:util", ], ) @@ -450,20 +566,25 @@ py_test( "no_pip", "noasan", # times out "notsan", + "optonly", # times out http://b/79220679 ], deps = [ + ":head", ":rnn", + "//tensorflow/contrib/data", "//tensorflow/core:protos_all_py", "//tensorflow/python:check_ops", "//tensorflow/python:client_testlib", "//tensorflow/python:dtypes", "//tensorflow/python:framework_ops", + "//tensorflow/python:lib", "//tensorflow/python:math_ops", "//tensorflow/python:state_ops", "//tensorflow/python:summary", "//tensorflow/python:training", "//tensorflow/python:variables", "//tensorflow/python/estimator:numpy_io", + "//tensorflow/python/estimator:parsing_utils", "//tensorflow/python/feature_column", "//third_party/py/numpy", "@six_archive//:six", diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py index be20d1b7770d3f..788ac5ca7046d6 100644 --- a/tensorflow/contrib/estimator/__init__.py +++ b/tensorflow/contrib/estimator/__init__.py @@ -19,11 +19,14 @@ from __future__ import print_function # pylint: disable=unused-import,line-too-long,wildcard-import +from tensorflow.contrib.estimator.python.estimator.baseline import * from tensorflow.contrib.estimator.python.estimator.boosted_trees import * from tensorflow.contrib.estimator.python.estimator.dnn import * from tensorflow.contrib.estimator.python.estimator.dnn_linear_combined import * +from tensorflow.contrib.estimator.python.estimator.export import * from tensorflow.contrib.estimator.python.estimator.extenders import * from tensorflow.contrib.estimator.python.estimator.head import * +from tensorflow.contrib.estimator.python.estimator.hooks import * from tensorflow.contrib.estimator.python.estimator.linear import * from tensorflow.contrib.estimator.python.estimator.logit_fns import * from tensorflow.contrib.estimator.python.estimator.multi_head import * @@ -38,11 +41,14 @@ 'binary_classification_head', 'clip_gradients_by_norm', 'forward_features', + 'InMemoryEvaluatorHook', + 'logistic_regression_head', 'multi_class_head', 'multi_head', 'multi_label_head', 'poisson_regression_head', 'regression_head', + 'BaselineEstimator', 'DNNEstimator', 'DNNLinearCombinedEstimator', 'LinearEstimator', @@ -54,6 +60,9 @@ 'replicate_model_fn', 'TowerOptimizer', 'RNNClassifier', + 'RNNEstimator', + 'export_saved_model_for_mode', + 'export_all_saved_models', ] remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/contrib/estimator/python/estimator/baseline.py b/tensorflow/contrib/estimator/python/estimator/baseline.py new file mode 100644 index 00000000000000..beffbee73064b9 --- /dev/null +++ b/tensorflow/contrib/estimator/python/estimator/baseline.py @@ -0,0 +1,98 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Baseline estimators."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.estimator import estimator
+from tensorflow.python.estimator.canned import baseline
+
+
+class BaselineEstimator(estimator.Estimator):
+  """An estimator that can establish a simple baseline.
+
+  The estimator uses a user-specified head.
+
+  This estimator ignores feature values and will learn to predict the average
+  value of each label. E.g. for single-label classification problems, this will
+  predict the probability distribution of the classes as seen in the labels.
+  For multi-label classification problems, it will predict the ratio of examples
+  that contain each class.
+
+  Example:
+
+  ```python
+
+  # Build baseline multi-label classifier.
+  estimator = BaselineEstimator(
+      head=tf.contrib.estimator.multi_label_head(n_classes=3))
+
+  # Input builders
+  def input_fn_train():  # returns x, y (where y represents label's class index).
+    pass
+
+  def input_fn_eval():  # returns x, y (where y represents label's class index).
+    pass
+
+  # Fit model.
+  estimator.train(input_fn=input_fn_train)
+
+  # Evaluates cross entropy between the test and train labels.
+  loss = estimator.evaluate(input_fn=input_fn_eval)["loss"]
+
+  # For each class, predicts the ratio of training examples that contain the
+  # class.
+  predictions = estimator.predict(new_samples)
+
+  ```
+
+  Input of `train` and `evaluate` should have the following features,
+  otherwise there will be a `KeyError`:
+
+  * if `weight_column` passed to the `head` constructor is not `None`, a feature
+    with `key=weight_column` whose value is a `Tensor`.
+  """
+
+  def __init__(self,
+               head,
+               model_dir=None,
+               optimizer='Ftrl',
+               config=None):
+    """Initializes a BaselineEstimator instance.
+
+    Args:
+      head: A `_Head` instance constructed with a method such as
+        `tf.contrib.estimator.multi_label_head`.
+      model_dir: Directory to save model parameters, graph, etc. This can
+        also be used to load checkpoints from the directory into an estimator
+        to continue training a previously saved model.
+      optimizer: String, `tf.Optimizer` object, or callable that creates the
+        optimizer to use for training. If not specified, will use
+        `FtrlOptimizer` with a default learning rate of 0.3.
+      config: `RunConfig` object to configure the runtime settings.
+ """ + def _model_fn(features, labels, mode, config): + return baseline._baseline_model_fn( # pylint: disable=protected-access + features=features, + labels=labels, + mode=mode, + head=head, + optimizer=optimizer, + config=config) + super(BaselineEstimator, self).__init__( + model_fn=_model_fn, + model_dir=model_dir, + config=config) diff --git a/tensorflow/contrib/estimator/python/estimator/baseline_test.py b/tensorflow/contrib/estimator/python/estimator/baseline_test.py new file mode 100644 index 00000000000000..d0e3e670f73328 --- /dev/null +++ b/tensorflow/contrib/estimator/python/estimator/baseline_test.py @@ -0,0 +1,430 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for baseline.py.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import shutil +import tempfile + +import numpy as np +import six + +from tensorflow.contrib.estimator.python.estimator import baseline +from tensorflow.contrib.estimator.python.estimator import head as head_lib +from tensorflow.python.client import session as tf_session +from tensorflow.python.estimator.canned import metric_keys +from tensorflow.python.estimator.export import export +from tensorflow.python.estimator.inputs import numpy_io +from tensorflow.python.feature_column import feature_column as feature_column_lib +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import check_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import variables +from tensorflow.python.ops.losses import losses +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test +from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import checkpoint_utils +from tensorflow.python.training import distribute as distribute_lib +from tensorflow.python.training import optimizer +from tensorflow.python.training import saver + +# Names of variables created by model. 
+BIAS_NAME = 'baseline/bias' + + +def assert_close(expected, actual, rtol=1e-04, name='assert_close'): + with ops.name_scope(name, 'assert_close', (expected, actual, rtol)) as scope: + expected = ops.convert_to_tensor(expected, name='expected') + actual = ops.convert_to_tensor(actual, name='actual') + rdiff = math_ops.abs(expected - actual, 'diff') / math_ops.abs(expected) + rtol = ops.convert_to_tensor(rtol, name='rtol') + return check_ops.assert_less( + rdiff, + rtol, + data=('Condition expected =~ actual did not hold element-wise:' + 'expected = ', expected, 'actual = ', actual, 'rdiff = ', rdiff, + 'rtol = ', rtol,), + name=scope) + + +def save_variables_to_ckpt(model_dir): + init_all_op = [variables.global_variables_initializer()] + with tf_session.Session() as sess: + sess.run(init_all_op) + saver.Saver().save(sess, os.path.join(model_dir, 'model.ckpt')) + + +def _baseline_estimator_fn( + weight_column=None, label_dimension=1, *args, **kwargs): + """Returns a BaselineEstimator that uses regression_head.""" + return baseline.BaselineEstimator( + head=head_lib.regression_head( + weight_column=weight_column, label_dimension=label_dimension, + # Tests in core (from which this test inherits) test the sum loss. + loss_reduction=losses.Reduction.SUM), + *args, **kwargs) + + +class BaselineEstimatorEvaluationTest(test.TestCase): + + def setUp(self): + self._model_dir = tempfile.mkdtemp() + + def tearDown(self): + if self._model_dir: + writer_cache.FileWriterCache.clear() + shutil.rmtree(self._model_dir) + + def test_evaluation_batch(self): + """Tests evaluation for batch_size==2.""" + with ops.Graph().as_default(): + variables.Variable([13.0], name=BIAS_NAME) + variables.Variable( + 100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64) + save_variables_to_ckpt(self._model_dir) + + baseline_estimator = _baseline_estimator_fn(model_dir=self._model_dir) + eval_metrics = baseline_estimator.evaluate( + input_fn=lambda: ({'age': ((1,), (1,))}, ((10.,), (10.,))), steps=1) + + # Logit is bias = 13, while label is 10. + # Loss per example is 3**2 = 9. + # Training loss is the sum over batch = 9 + 9 = 18 + # Average loss is the average over batch = 9 + self.assertDictEqual({ + metric_keys.MetricKeys.LOSS: 18., + metric_keys.MetricKeys.LOSS_MEAN: 9., + ops.GraphKeys.GLOBAL_STEP: 100 + }, eval_metrics) + + def test_evaluation_weights(self): + """Tests evaluation with weights.""" + with ops.Graph().as_default(): + variables.Variable([13.0], name=BIAS_NAME) + variables.Variable( + 100, name=ops.GraphKeys.GLOBAL_STEP, dtype=dtypes.int64) + save_variables_to_ckpt(self._model_dir) + + def _input_fn(): + features = {'age': ((1,), (1,)), 'weights': ((1.,), (2.,))} + labels = ((10.,), (10.,)) + return features, labels + + baseline_estimator = _baseline_estimator_fn( + weight_column='weights', + model_dir=self._model_dir) + eval_metrics = baseline_estimator.evaluate(input_fn=_input_fn, steps=1) + + # Logit is bias = 13, while label is 10. + # Loss per example is 3**2 = 9. 
+ # Training loss is the weighted sum over batch = 9 + 2*9 = 27 + # average loss is the weighted average = 9 + 2*9 / (1 + 2) = 9 + self.assertDictEqual({ + metric_keys.MetricKeys.LOSS: 27., + metric_keys.MetricKeys.LOSS_MEAN: 9., + ops.GraphKeys.GLOBAL_STEP: 100 + }, eval_metrics) + + def test_evaluation_for_multi_dimensions(self): + label_dim = 2 + with ops.Graph().as_default(): + variables.Variable([46.0, 58.0], name=BIAS_NAME) + variables.Variable(100, name='global_step', dtype=dtypes.int64) + save_variables_to_ckpt(self._model_dir) + + baseline_estimator = _baseline_estimator_fn( + label_dimension=label_dim, + model_dir=self._model_dir) + input_fn = numpy_io.numpy_input_fn( + x={ + 'age': np.array([[2., 4., 5.]]), + }, + y=np.array([[46., 58.]]), + batch_size=1, + num_epochs=None, + shuffle=False) + eval_metrics = baseline_estimator.evaluate(input_fn=input_fn, steps=1) + + self.assertItemsEqual( + (metric_keys.MetricKeys.LOSS, metric_keys.MetricKeys.LOSS_MEAN, + ops.GraphKeys.GLOBAL_STEP), eval_metrics.keys()) + + # Logit is bias which is [46, 58] + self.assertAlmostEqual(0, eval_metrics[metric_keys.MetricKeys.LOSS]) + + +class BaselineEstimatorPredictTest(test.TestCase): + + def setUp(self): + self._model_dir = tempfile.mkdtemp() + + def tearDown(self): + if self._model_dir: + writer_cache.FileWriterCache.clear() + shutil.rmtree(self._model_dir) + + def test_1d(self): + """Tests predict when all variables are one-dimensional.""" + with ops.Graph().as_default(): + variables.Variable([.2], name=BIAS_NAME) + variables.Variable(100, name='global_step', dtype=dtypes.int64) + save_variables_to_ckpt(self._model_dir) + + baseline_estimator = _baseline_estimator_fn(model_dir=self._model_dir) + + predict_input_fn = numpy_io.numpy_input_fn( + x={'x': np.array([[2.]])}, + y=None, + batch_size=1, + num_epochs=1, + shuffle=False) + predictions = baseline_estimator.predict(input_fn=predict_input_fn) + predicted_scores = list([x['predictions'] for x in predictions]) + # x * weight + bias = 2. * 10. 
+ .2 = 20.2
+    self.assertAllClose([[.2]], predicted_scores)
+
+  def testMultiDim(self):
+    """Tests predict when all variables are multi-dimensional."""
+    batch_size = 2
+    label_dimension = 3
+    with ops.Graph().as_default():
+      variables.Variable(  # shape=[label_dimension]
+          [.2, .4, .6], name=BIAS_NAME)
+      variables.Variable(100, name='global_step', dtype=dtypes.int64)
+      save_variables_to_ckpt(self._model_dir)
+
+    baseline_estimator = _baseline_estimator_fn(
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+
+    predict_input_fn = numpy_io.numpy_input_fn(
+        # x shape=[batch_size, x_dim]
+        x={'x': np.array([[1., 2., 3., 4.], [5., 6., 7., 8.]])},
+        y=None,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+    predictions = baseline_estimator.predict(input_fn=predict_input_fn)
+    predicted_scores = list([x['predictions'] for x in predictions])
+    # score = bias, shape=[batch_size, label_dimension]
+    self.assertAllClose([[0.2, 0.4, 0.6], [0.2, 0.4, 0.6]],
+                        predicted_scores)
+
+
+class BaselineEstimatorIntegrationTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _test_complete_flow(self, train_input_fn, eval_input_fn, predict_input_fn,
+                          input_dimension, label_dimension, prediction_length):
+    feature_columns = [
+        feature_column_lib.numeric_column('x', shape=(input_dimension,))
+    ]
+    est = _baseline_estimator_fn(
+        label_dimension=label_dimension,
+        model_dir=self._model_dir)
+
+    # TRAIN
+    # learn y = x
+    est.train(train_input_fn, steps=200)
+
+    # EVALUATE
+    scores = est.evaluate(eval_input_fn)
+    self.assertEqual(200, scores[ops.GraphKeys.GLOBAL_STEP])
+    self.assertIn(metric_keys.MetricKeys.LOSS, six.iterkeys(scores))
+
+    # PREDICT
+    predictions = np.array(
+        [x['predictions'] for x in est.predict(predict_input_fn)])
+    self.assertAllEqual((prediction_length, label_dimension), predictions.shape)
+
+    # EXPORT
+    feature_spec = feature_column_lib.make_parse_example_spec(feature_columns)
+    serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn(
+        feature_spec)
+    export_dir = est.export_savedmodel(tempfile.mkdtemp(),
+                                       serving_input_receiver_fn)
+    self.assertTrue(gfile.Exists(export_dir))
+
+  def test_numpy_input_fn(self):
+    """Tests complete flow with numpy_input_fn."""
+    label_dimension = 2
+    input_dimension = label_dimension
+    batch_size = 10
+    prediction_length = batch_size
+    data = np.linspace(0., 2., batch_size * label_dimension, dtype=np.float32)
+    data = data.reshape(batch_size, label_dimension)
+
+    train_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=None,
+        shuffle=True)
+    eval_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=data,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+    predict_input_fn = numpy_io.numpy_input_fn(
+        x={'x': data},
+        y=None,
+        batch_size=batch_size,
+        num_epochs=1,
+        shuffle=False)
+
+    self._test_complete_flow(
+        train_input_fn=train_input_fn,
+        eval_input_fn=eval_input_fn,
+        predict_input_fn=predict_input_fn,
+        input_dimension=input_dimension,
+        label_dimension=label_dimension,
+        prediction_length=prediction_length)
+
+
+class BaselineEstimatorTrainingTest(test.TestCase):
+
+  def setUp(self):
+    self._model_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    if self._model_dir:
+      writer_cache.FileWriterCache.clear()
+      shutil.rmtree(self._model_dir)
+
+  def _mock_optimizer(self, expected_loss=None):
expected_var_names = [ + '%s:0' % BIAS_NAME + ] + + def _minimize(loss, global_step=None, var_list=None): + trainable_vars = var_list or ops.get_collection( + ops.GraphKeys.TRAINABLE_VARIABLES) + self.assertItemsEqual(expected_var_names, + [var.name for var in trainable_vars]) + + # Verify loss. We can't check the value directly, so we add an assert op. + self.assertEquals(0, loss.shape.ndims) + if expected_loss is None: + if global_step is not None: + return distribute_lib.increment_var(global_step) + return control_flow_ops.no_op() + assert_loss = assert_close( + math_ops.to_float(expected_loss, name='expected'), + loss, + name='assert_loss') + with ops.control_dependencies((assert_loss,)): + if global_step is not None: + return distribute_lib.increment_var(global_step) + return control_flow_ops.no_op() + + mock_optimizer = test.mock.NonCallableMock( + spec=optimizer.Optimizer, + wraps=optimizer.Optimizer(use_locking=False, name='my_optimizer')) + mock_optimizer.minimize = test.mock.MagicMock(wraps=_minimize) + + # NOTE: Estimator.params performs a deepcopy, which wreaks havoc with mocks. + # So, return mock_optimizer itself for deepcopy. + mock_optimizer.__deepcopy__ = lambda _: mock_optimizer + return mock_optimizer + + def _assert_checkpoint(self, + label_dimension, + expected_global_step, + expected_bias=None): + shapes = { + name: shape + for (name, shape) in checkpoint_utils.list_variables(self._model_dir) + } + + self.assertEqual([], shapes[ops.GraphKeys.GLOBAL_STEP]) + self.assertEqual(expected_global_step, + checkpoint_utils.load_variable(self._model_dir, + ops.GraphKeys.GLOBAL_STEP)) + + self.assertEqual([label_dimension], shapes[BIAS_NAME]) + if expected_bias is not None: + self.assertEqual(expected_bias, + checkpoint_utils.load_variable(self._model_dir, + BIAS_NAME)) + + def testFromScratch(self): + # Create BaselineRegressor. + label = 5. + age = 17 + # loss = (logits - label)^2 = (0 - 5.)^2 = 25. + mock_optimizer = self._mock_optimizer(expected_loss=25.) + baseline_estimator = _baseline_estimator_fn( + model_dir=self._model_dir, + optimizer=mock_optimizer) + self.assertEqual(0, mock_optimizer.minimize.call_count) + + # Train for a few steps, and validate optimizer and final checkpoint. + num_steps = 10 + baseline_estimator.train( + input_fn=lambda: ({'age': ((age,),)}, ((label,),)), steps=num_steps) + self.assertEqual(1, mock_optimizer.minimize.call_count) + self._assert_checkpoint( + label_dimension=1, + expected_global_step=num_steps, + expected_bias=[0.]) + + def testFromCheckpoint(self): + # Create initial checkpoint. + bias = 7.0 + initial_global_step = 100 + with ops.Graph().as_default(): + variables.Variable([bias], name=BIAS_NAME) + variables.Variable( + initial_global_step, + name=ops.GraphKeys.GLOBAL_STEP, + dtype=dtypes.int64) + save_variables_to_ckpt(self._model_dir) + + # logits = bias = 6. + # loss = (logits - label)^2 = (7 - 5)^2 = 4 + mock_optimizer = self._mock_optimizer(expected_loss=4.) + baseline_estimator = _baseline_estimator_fn( + model_dir=self._model_dir, + optimizer=mock_optimizer) + self.assertEqual(0, mock_optimizer.minimize.call_count) + + # Train for a few steps, and validate optimizer and final checkpoint. 
+ num_steps = 10 + baseline_estimator.train( + input_fn=lambda: ({'age': ((17,),)}, ((5.,),)), steps=num_steps) + self.assertEqual(1, mock_optimizer.minimize.call_count) + self._assert_checkpoint( + label_dimension=1, + expected_global_step=initial_global_step + num_steps, + expected_bias=[bias]) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/estimator/python/estimator/dnn.py b/tensorflow/contrib/estimator/python/estimator/dnn.py index cf6e3329d2e277..7ff25b95c079c7 100644 --- a/tensorflow/contrib/estimator/python/estimator/dnn.py +++ b/tensorflow/contrib/estimator/python/estimator/dnn.py @@ -93,7 +93,7 @@ def __init__(self, dropout=None, input_layer_partitioner=None, config=None): - """Initializes a `DNNClassifier` instance. + """Initializes a `DNNEstimator` instance. Args: head: A `_Head` instance constructed with a method such as diff --git a/tensorflow/contrib/estimator/python/estimator/export.py b/tensorflow/contrib/estimator/python/estimator/export.py new file mode 100644 index 00000000000000..03cf6f107c1c55 --- /dev/null +++ b/tensorflow/contrib/estimator/python/estimator/export.py @@ -0,0 +1,223 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Wrapper for methods to export train/eval graphs from Estimator.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.python.estimator import model_fn as model_fn_lib + + +def export_saved_model_for_mode( + estimator, export_dir_base, input_receiver_fn, + assets_extra=None, + as_text=False, + checkpoint_path=None, + strip_default_attrs=False, + mode=model_fn_lib.ModeKeys.PREDICT): + # pylint: disable=line-too-long + """Exports a single train/eval/predict graph as a SavedModel. + + For a detailed guide, see + @{$saved_model#using_savedmodel_with_estimators$Using SavedModel with Estimators}. + + Sample usage: + ```python + classifier = tf.estimator.LinearClassifier( + feature_columns=[age, language]) + classifier.train(input_fn=input_fn, steps=1000) + + feature_spec = { + 'age': tf.placeholder(dtype=tf.int64), + 'language': array_ops.placeholder(dtype=tf.string) + } + label_spec = tf.placeholder(dtype=dtypes.int64) + + train_rcvr_fn = tf.contrib.estimator.build_raw_supervised_input_receiver_fn( + feature_spec, label_spec) + + export_dir = tf.contrib.estimator.export_saved_model_for_mode( + classifier, + export_dir_base='my_model/', + input_receiver_fn=train_rcvr_fn, + mode=model_fn_lib.ModeKeys.TRAIN) + + # export_dir is a timestamped directory with the SavedModel, which + # can be used for serving, analysis with TFMA, or directly loaded in. + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.TRAINING], export_dir) + weights = graph.get_tensor_by_name(''linear/linear_model/age/weights') + ... 
+ ``` + + This method is a wrapper for _export_all_saved_models, and wraps a raw + input_receiver_fn in a dictionary to pass in to that function. + See _export_all_saved_models for full docs. + + See tf.contrib.estimator.export_saved_model_for_mode for the currently + exposed version of this function. + + Args: + estimator: an instance of tf.estimator.Estimator + export_dir_base: A string containing a directory in which to create + timestamped subdirectories containing exported SavedModels. + input_receiver_fn: a function that takes no argument and + returns the appropriate subclass of `InputReceiver`. + assets_extra: A dict specifying how to populate the assets.extra directory + within the exported SavedModel, or `None` if no extra assets are needed. + as_text: whether to write the SavedModel proto in text format. + checkpoint_path: The checkpoint path to export. If `None` (the default), + the most recent checkpoint found within the model directory is chosen. + strip_default_attrs: Boolean. If `True`, default-valued attributes will be + removed from the NodeDefs. For a detailed guide, see + [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes). + mode: tf.estimator.ModeKeys value indicating with mode will be exported. + + Returns: + The string path to the exported directory. + + Raises: + ValueError: if input_receiver_fn is None, no export_outputs + are provided, or no checkpoint can be found. + """ + # pylint: enable=line-too-long + + # pylint: disable=protected-access + return estimator._export_saved_model_for_mode( + export_dir_base, input_receiver_fn, + assets_extra=assets_extra, + as_text=as_text, + checkpoint_path=checkpoint_path, + strip_default_attrs=strip_default_attrs, + mode=mode) + # pylint: enable=protected-access + + +def export_all_saved_models( + estimator, export_dir_base, input_receiver_fn_map, + assets_extra=None, + as_text=False, + checkpoint_path=None, + strip_default_attrs=False): + # pylint: disable=line-too-long + """Exports requested train/eval/predict graphs as separate SavedModels. + + See tf.contrib.estimator.export_all_saved_models for the currently + exposed version of this function. + + For each mode passed in via the input_receiver_fn_map, + this method builds a new graph by calling the input_receiver_fn to obtain + feature and label `Tensor`s. Next, this method calls the `Estimator`'s + model_fn in the passed mode to generate the model graph based on + those features and labels, and restores the given checkpoint + (or, lacking that, the most recent checkpoint) into the graph. + Only one of the modes is used for saving variables to the SavedModel + (order of preference: TRAIN, EVAL, then PREDICT), such that up to three + MetaGraphDefs are saved with a single set of variables in a single + SavedModel directory. + + For prediction, the exported `MetaGraphDef` will provide one `SignatureDef` + for each element of the export_outputs dict returned from the model_fn, + named using the same keys. One of these keys is always + signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY, indicating which + signature will be served when a serving request does not specify one. + For each signature, the outputs are provided by the corresponding + `ExportOutput`s, and the inputs are always the input receivers provided by + the serving_input_receiver_fn. 
+ + For training and evaluation, the train_op is stored in an extra collection, + and loss, metrics, and predictions are included in a SignatureDef for the + mode in question. + + Extra assets may be written into the SavedModel via the assets_extra + argument. This should be a dict, where each key gives a destination path + (including the filename) relative to the assets.extra directory. The + corresponding value gives the full path of the source file to be copied. + For example, the simple case of copying a single file without renaming it + is specified as `{'my_asset_file.txt': '/path/to/my_asset_file.txt'}`. + + Sample usage: + ```python + classifier = tf.estimator.LinearClassifier( + feature_columns=[age, language]) + classifier.train(input_fn=input_fn) + + feature_spec = { + 'age': tf.placeholder(dtype=tf.int64), + 'language': array_ops.placeholder(dtype=tf.string) + } + label_spec = tf.placeholder(dtype=dtypes.int64) + + train_rcvr_fn = tf.contrib.estimator.build_raw_supervised_input_receiver_fn( + feature_spec, label_spec) + + serve_rcvr_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn( + feature_spec) + + rcvr_fn_map = { + model_fn_lib.ModeKeys.TRAIN: train_rcvr_fn, + model_fn_lib.ModeKeys.PREDICT: serve_rcvr_fn, + } + + export_dir = tf.contrib.estimator.export_all_saved_models( + classifier, + export_dir_base='my_model/', + input_receiver_fn_map=rcvr_fn_map) + + # export_dirs is a dict of directories with SavedModels, which + # can be used for serving, analysis with TFMA, or directly loaded in. + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.TRAINING], export_dir) + weights = graph.get_tensor_by_name('linear/linear_model/age/weights') + ... + ``` + + Args: + estimator: an instance of tf.estimator.Estimator + export_dir_base: A string containing a directory in which to create + timestamped subdirectories containing exported SavedModels. + input_receiver_fn_map: dict of tf.estimator.ModeKeys to input_receiver_fn + mappings, where the input_receiver_fn is a function that takes no + argument and returns the appropriate subclass of `InputReceiver`. + assets_extra: A dict specifying how to populate the assets.extra directory + within the exported SavedModel, or `None` if no extra assets are needed. + as_text: whether to write the SavedModel proto in text format. + checkpoint_path: The checkpoint path to export. If `None` (the default), + the most recent checkpoint found within the model directory is chosen. + strip_default_attrs: Boolean. If `True`, default-valued attributes will be + removed from the NodeDefs. For a detailed guide, see + [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes). + + Returns: + A dict of tf.estimator.ModeKeys value to string path for each exported + directory. + + Raises: + ValueError: if any input_receiver_fn is None, no export_outputs + are provided, or no checkpoint can be found. 
+ """ + # pylint: enable=line-too-long + + # pylint: disable=protected-access + return estimator._export_all_saved_models( + export_dir_base, input_receiver_fn_map, + assets_extra=assets_extra, + as_text=as_text, + checkpoint_path=checkpoint_path, + strip_default_attrs=strip_default_attrs) + # pylint: enable=protected-access diff --git a/tensorflow/contrib/estimator/python/estimator/export_test.py b/tensorflow/contrib/estimator/python/estimator/export_test.py new file mode 100644 index 00000000000000..050821ee672f30 --- /dev/null +++ b/tensorflow/contrib/estimator/python/estimator/export_test.py @@ -0,0 +1,373 @@ +# Copyright 2016 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for contrib wrapping of export_saved_model_for_mode functionality. + +These are direct copies of the tests included in core, with import locations +changed. These should be removed when the functionality in core is part of the +public API. +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tempfile + +from tensorflow.contrib.estimator.python.estimator import export as contrib_export +from tensorflow.python.client import session +from tensorflow.python.estimator import estimator +from tensorflow.python.estimator import model_fn as model_fn_lib +from tensorflow.python.estimator.export import export +from tensorflow.python.estimator.export import export_output +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import metrics as metrics_lib +from tensorflow.python.ops import parsing_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variables +from tensorflow.python.platform import gfile +from tensorflow.python.platform import test +from tensorflow.python.saved_model import loader +from tensorflow.python.saved_model import tag_constants +from tensorflow.python.training import training +from tensorflow.python.util import compat + + +def _model_fn_for_export_tests(features, labels, mode): + _, _ = features, labels + variables.Variable(1., name='weight') + scores = constant_op.constant([3.]) + classes = constant_op.constant(['wumpus']) + update_global_step = state_ops.assign_add(training.get_global_step(), 1) + with ops.control_dependencies([update_global_step]): + train_op = constant_op.constant(2.) 
+ return model_fn_lib.EstimatorSpec( + mode, + predictions=constant_op.constant(10.), + loss=constant_op.constant(1.), + train_op=train_op, + export_outputs={ + 'test': export_output.ClassificationOutput(scores, classes)}) + + +def _x_y_input_fn(): + return ({'x': constant_op.constant([[1], [1]]), + 'y': constant_op.constant([[2], [2]])}, + constant_op.constant([[1], [1]])) + + +def _model_fn_with_x_y(features, labels, mode): + _ = labels + variables.Variable(1., name='weight') + scores = constant_op.constant([3.]) + classes = constant_op.constant(['wumpus']) + if mode == model_fn_lib.ModeKeys.PREDICT: + variables.Variable(36., name='name_collision') + return model_fn_lib.EstimatorSpec( + mode, + predictions=constant_op.constant(10.), + export_outputs={ + 'test': export_output.ClassificationOutput(scores, classes)}) + else: + prefix = 'eval_' if mode == model_fn_lib.ModeKeys.EVAL else '' + + multiplied = math_ops.multiply( + features['x'], features['y'], name='{}multiplied'.format(prefix)) + metrics = {'mean': metrics_lib.mean(features['x'] - features['y'], + name='{}mean'.format(prefix))} + variables.Variable(1., name='later_var') + variables.Variable(3., name='name_collision') + return model_fn_lib.EstimatorSpec( + mode, + predictions=multiplied, + loss=constant_op.constant(1.), + train_op=state_ops.assign_add(training.get_global_step(), 1), + eval_metric_ops=metrics) + + +def _get_serving_input_receiver_fn(): + feature_spec = {'x': parsing_ops.VarLenFeature(dtype=dtypes.int64), + 'y': parsing_ops.VarLenFeature(dtype=dtypes.int64)} + return export.build_parsing_serving_input_receiver_fn(feature_spec) + + +def _get_supervised_input_receiver_fn(): + feature_spec = { + 'x': array_ops.placeholder( + dtype=dtypes.int64, shape=(2, 1), name='feature_x'), + 'y': array_ops.placeholder( + dtype=dtypes.int64, shape=(2, 1), name='feature_y') + } + label_spec = array_ops.placeholder( + dtype=dtypes.float32, shape=[1], name='truth') + + return export.build_raw_supervised_input_receiver_fn( + feature_spec, label_spec) + + +class EstimatorExportTest(test.TestCase): + + def test_export_saved_model_train(self): + self._test_export_saved_model_for_mode( + _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.TRAIN) + + def test_export_saved_model_eval(self): + self._test_export_saved_model_for_mode( + _get_supervised_input_receiver_fn(), model_fn_lib.ModeKeys.EVAL) + + def test_export_saved_model_predict(self): + self._test_export_saved_model_for_mode( + _get_serving_input_receiver_fn(), model_fn_lib.ModeKeys.PREDICT) + + def _test_export_saved_model_for_mode(self, input_receiver_fn, mode): + tmpdir = tempfile.mkdtemp() + est = estimator.Estimator(model_fn=_model_fn_for_export_tests) + est.train(input_fn=_x_y_input_fn, steps=1) + + # Perform the export. + export_dir_base = os.path.join( + compat.as_bytes(tmpdir), compat.as_bytes('export')) + export_dir = contrib_export.export_saved_model_for_mode( + est, export_dir_base, input_receiver_fn, mode=mode) + + # Check that all the files are in the right places. + self.assertTrue(gfile.Exists(export_dir_base)) + self._validate_exported_files(export_dir) + + # Restore, to validate that the export was well-formed. + tag_set = model_fn_lib.EXPORT_TAG_MAP[mode] + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, tag_set, export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertFalse('name_collision_1' in graph_ops) + self.assertTrue('weight' in graph_ops) + + # Clean up. 
+ gfile.DeleteRecursively(tmpdir) + + def test_export_all_saved_models_proto_roundtrip_receiver_map(self): + input_receiver_fn_map = { + model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn() + } + export_dir, tmpdir = self._test_export_all_saved_models( + input_receiver_fn_map) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.SERVING], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('input_example_tensor' in graph_ops) + self.assertTrue('ParseExample/ParseExample' in graph_ops) + self.assertFalse('feature_x' in graph_ops) + self.assertTrue('weight' in graph_ops) + + # Clean up. + gfile.DeleteRecursively(tmpdir) + + def test_export_all_saved_models_proto_roundtrip_train_only(self): + input_receiver_fn_map = { + model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(), + } + export_dir, tmpdir = self._test_export_all_saved_models( + input_receiver_fn_map) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.TRAINING], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('multiplied' in graph_ops) + self.assertTrue('mean/update_op' in graph_ops) + self.assertFalse('eval_multiplied' in graph_ops) + self.assertTrue('feature_x' in graph_ops) + self.assertTrue('weight' in graph_ops) + + # Clean up. + gfile.DeleteRecursively(tmpdir) + + def test_export_all_saved_models_proto_roundtrip_eval_only(self): + input_receiver_fn_map = { + model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn() + } + export_dir, tmpdir = self._test_export_all_saved_models( + input_receiver_fn_map) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.EVAL], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('eval_multiplied' in graph_ops) + self.assertTrue('eval_mean/value' in graph_ops) + self.assertFalse('multiplied' in graph_ops) + self.assertTrue('feature_x' in graph_ops) + self.assertTrue('weight' in graph_ops) + + # Clean up. + gfile.DeleteRecursively(tmpdir) + + def test_export_all_saved_models_proto_roundtrip_no_serving(self): + input_receiver_fn_map = { + model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(), + model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn() + } + export_dir, tmpdir = self._test_export_all_saved_models( + input_receiver_fn_map) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.TRAINING], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('multiplied' in graph_ops) + self.assertFalse('eval_multiplied' in graph_ops) + self.assertTrue('feature_x' in graph_ops) + self.assertTrue('weight' in graph_ops) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.EVAL], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('eval_multiplied' in graph_ops) + self.assertFalse('multiplied' in graph_ops) + # TODO(karmel): is this the desired behavior when names are shared? + self.assertTrue('feature_x_1' in graph_ops) + self.assertTrue('feature_y_1' in graph_ops) + self.assertTrue('weight' in graph_ops) + + # Clean up. 
+ gfile.DeleteRecursively(tmpdir) + + def test_export_all_saved_models_proto_roundtrip_three_defs(self): + input_receiver_fn_map = { + model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(), + model_fn_lib.ModeKeys.EVAL: _get_supervised_input_receiver_fn(), + model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn() + } + export_dir, tmpdir = self._test_export_all_saved_models( + input_receiver_fn_map) + + # Restore, to validate that the export was well-formed. + for tag_set in model_fn_lib.EXPORT_TAG_MAP.values(): + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, tag_set, export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('global_step/Assign' in graph_ops) + self.assertTrue('global_step/Initializer/zeros' in graph_ops) + self.assertTrue('weight' in graph_ops) + + # Clean up. + gfile.DeleteRecursively(tmpdir) + + def test_export_all_saved_models_proto_roundtrip_all_vars(self): + input_receiver_fn_map = { + model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(), + model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn() + } + export_dir, tmpdir = self._test_export_all_saved_models( + input_receiver_fn_map) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.TRAINING], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('later_var' in graph_ops) + self.assertTrue('weight' in graph_ops) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.SERVING], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertFalse('later_var' in graph_ops) + self.assertTrue('weight' in graph_ops) + + # Clean up. + gfile.DeleteRecursively(tmpdir) + + def test_export_all_saved_models_name_collision(self): + input_receiver_fn_map = { + model_fn_lib.ModeKeys.TRAIN: _get_supervised_input_receiver_fn(), + model_fn_lib.ModeKeys.PREDICT: _get_serving_input_receiver_fn() + } + export_dir, tmpdir = self._test_export_all_saved_models( + input_receiver_fn_map) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.TRAINING], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('name_collision' in graph_ops) + self.assertFalse('name_collision_1' in graph_ops) + collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + self.assertEqual(3, collection_vars[-1].eval()) + + with ops.Graph().as_default() as graph: + with session.Session(graph=graph) as sess: + loader.load(sess, [tag_constants.SERVING], export_dir) + graph_ops = [x.name for x in graph.get_operations()] + self.assertTrue('name_collision' in graph_ops) + self.assertFalse('name_collision_1' in graph_ops) + collection_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + # This is a non-obvious detail: when we load the estimator spec + # for predict, name_collision gets set to 36. However, we then restore + # from checkpoint, which should overwrite that var and make it the 3 + # from training. In practice, this would not be a good way to write + # a model_fn, but leaving this check in for now to ensure consistency + # with what would happen given our current order of spec, then + # checkpoint. + self.assertEqual(3, collection_vars[-1].eval()) + + # Clean up. 
+ gfile.DeleteRecursively(tmpdir) + + def _test_export_all_saved_models(self, input_receiver_fn_map): + tmpdir = tempfile.mkdtemp() + est = estimator.Estimator(model_fn=_model_fn_with_x_y) + est.train(input_fn=_x_y_input_fn, steps=1) + + # Perform the export. + export_dir_base = os.path.join( + compat.as_bytes(tmpdir), compat.as_bytes('export')) + export_dir = contrib_export.export_all_saved_models( + est, export_dir_base, input_receiver_fn_map) + + # Check that all the files are in the right places. + self.assertTrue(gfile.Exists(export_dir_base)) + + self._validate_exported_files(export_dir) + + return export_dir, tmpdir + + def _validate_exported_files(self, export_dir): + self.assertTrue(gfile.Exists(export_dir)) + self.assertTrue(gfile.Exists(os.path.join( + compat.as_bytes(export_dir), + compat.as_bytes('saved_model.pb')))) + self.assertTrue(gfile.Exists(os.path.join( + compat.as_bytes(export_dir), + compat.as_bytes('variables')))) + self.assertTrue(gfile.Exists(os.path.join( + compat.as_bytes(export_dir), + compat.as_bytes('variables/variables.index')))) + self.assertTrue(gfile.Exists(os.path.join( + compat.as_bytes(export_dir), + compat.as_bytes('variables/variables.data-00000-of-00001')))) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/estimator/python/estimator/extenders.py b/tensorflow/contrib/estimator/python/estimator/extenders.py index 201699ed775f70..bf08be09e7baf6 100644 --- a/tensorflow/contrib/estimator/python/estimator/extenders.py +++ b/tensorflow/contrib/estimator/python/estimator/extenders.py @@ -22,12 +22,12 @@ from tensorflow.python.estimator import estimator as estimator_lib from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.estimator import util as estimator_util from tensorflow.python.estimator.export.export_output import PredictOutput from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib from tensorflow.python.ops import clip_ops from tensorflow.python.training import optimizer as optimizer_lib +from tensorflow.python.util import function_utils _VALID_METRIC_FN_ARGS = set(['features', 'labels', 'predictions', 'config']) @@ -330,7 +330,7 @@ def get_slot_names(self, *args, **kwargs): def _verify_metric_fn_args(metric_fn): - args = set(estimator_util.fn_args(metric_fn)) + args = set(function_utils.fn_args(metric_fn)) invalid_args = list(args - _VALID_METRIC_FN_ARGS) if invalid_args: raise ValueError('metric_fn (%s) has following not expected args: %s' % @@ -339,7 +339,7 @@ def _verify_metric_fn_args(metric_fn): def _call_metric_fn(metric_fn, features, labels, predictions, config): """Calls metric fn with proper arguments.""" - metric_fn_args = estimator_util.fn_args(metric_fn) + metric_fn_args = function_utils.fn_args(metric_fn) kwargs = {} if 'features' in metric_fn_args: kwargs['features'] = features diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py index 3dcf0374c8a12b..b798769d2cfde6 100644 --- a/tensorflow/contrib/estimator/python/estimator/head.py +++ b/tensorflow/contrib/estimator/python/estimator/head.py @@ -18,6 +18,8 @@ from __future__ import division from __future__ import print_function +import six + from tensorflow.python.estimator import model_fn from tensorflow.python.estimator.canned import head as head_lib from tensorflow.python.estimator.canned import metric_keys @@ -72,6 +74,33 @@ def multi_class_head(n_classes, shape `[D0, D1, ... DN, 1]`. 
Namely, the head applies `label_vocabulary` to the input labels before passing them to `loss_fn`. + The head can be used with a canned estimator. Example: + + ```python + my_head = tf.contrib.estimator.multi_class_head(n_classes=3) + my_estimator = tf.contrib.estimator.DNNEstimator( + head=my_head, + hidden_units=..., + feature_columns=...) + ``` + + It can also be used with a custom `model_fn`. Example: + + ```python + def _my_model_fn(features, labels, mode): + my_head = tf.contrib.estimator.multi_class_head(n_classes=3) + logits = tf.keras.Model(...)(features) + + return my_head.create_estimator_spec( + features=features, + mode=mode, + labels=labels, + optimizer=tf.AdagradOptimizer(learning_rate=0.1), + logits=logits) + + my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn) + ``` + Args: n_classes: Number of classes, must be greater than 2 (for 2 classes, use `binary_classification_head`). @@ -139,6 +168,33 @@ def binary_classification_head( shape `[D0, D1, ... DN, 1]`. Namely, the head applies `label_vocabulary` to the input labels before passing them to `loss_fn`. + The head can be used with a canned estimator. Example: + + ```python + my_head = tf.contrib.estimator.binary_classification_head() + my_estimator = tf.contrib.estimator.DNNEstimator( + head=my_head, + hidden_units=..., + feature_columns=...) + ``` + + It can also be used with a custom `model_fn`. Example: + + ```python + def _my_model_fn(features, labels, mode): + my_head = tf.contrib.estimator.binary_classification_head() + logits = tf.keras.Model(...)(features) + + return my_head.create_estimator_spec( + features=features, + mode=mode, + labels=labels, + optimizer=tf.AdagradOptimizer(learning_rate=0.1), + logits=logits) + + my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn) + ``` + Args: weight_column: A string or a `_NumericColumn` created by `tf.feature_column.numeric_column` defining feature column representing @@ -205,11 +261,39 @@ def regression_head(weight_column=None, shape `[D0, D1, ... DN, label_dimension]`. Also supports custom `inverse_link_fn`, also known as 'mean function'. - `inverse_link_fn` takes `logits` as argument and returns predicted values. - This function is the inverse of the link function defined in + `inverse_link_fn` is only used in `PREDICT` mode. It takes `logits` as + argument and returns predicted values. This function is the inverse of the + link function defined in https://en.wikipedia.org/wiki/Generalized_linear_model#Link_function Namely, for poisson regression, set `inverse_link_fn=tf.exp`. + The head can be used with a canned estimator. Example: + + ```python + my_head = tf.contrib.estimator.regression_head() + my_estimator = tf.contrib.estimator.DNNEstimator( + head=my_head, + hidden_units=..., + feature_columns=...) + ``` + + It can also be used with a custom `model_fn`. Example: + + ```python + def _my_model_fn(features, labels, mode): + my_head = tf.contrib.estimator.regression_head() + logits = tf.keras.Model(...)(features) + + return my_head.create_estimator_spec( + features=features, + mode=mode, + labels=labels, + optimizer=tf.AdagradOptimizer(learning_rate=0.1), + logits=logits) + + my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn) + ``` + Args: weight_column: A string or a `_NumericColumn` created by `tf.feature_column.numeric_column` defining feature column representing @@ -234,7 +318,7 @@ def regression_head(weight_column=None, Raises: ValueError: If `label_dimension` or `loss_reduction` is invalid. 
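As a concrete illustration of the `inverse_link_fn` behaviour described above, here is a minimal, standalone TF 1.x sketch (illustrative only) of the mapping applied to logits in `PREDICT` mode when `inverse_link_fn=tf.exp`, the inverse of the Poisson log link:

```python
import tensorflow as tf

# Logits produced by the model, shape [batch_size, label_dimension].
logits = tf.constant([[0.0], [1.0], [2.0]])

# With inverse_link_fn=tf.exp, PREDICT-mode predictions are simply exp(logits).
# The training loss is unaffected: inverse_link_fn is applied only in PREDICT.
predictions = tf.exp(logits)

with tf.Session() as sess:
  print(sess.run(predictions))  # approximately [[1.0], [2.718], [7.389]]
```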
""" - return head_lib._regression_head_with_mean_squared_error_loss( # pylint:disable=protected-access + return head_lib._regression_head( # pylint:disable=protected-access weight_column=weight_column, label_dimension=label_dimension, loss_reduction=loss_reduction, @@ -269,6 +353,33 @@ def poisson_regression_head( This is implemented as a generalized linear model, see https://en.wikipedia.org/wiki/Generalized_linear_model. + The head can be used with a canned estimator. Example: + + ```python + my_head = tf.contrib.estimator.poisson_regression_head() + my_estimator = tf.contrib.estimator.DNNEstimator( + head=my_head, + hidden_units=..., + feature_columns=...) + ``` + + It can also be used with a custom `model_fn`. Example: + + ```python + def _my_model_fn(features, labels, mode): + my_head = tf.contrib.estimator.poisson_regression_head() + logits = tf.keras.Model(...)(features) + + return my_head.create_estimator_spec( + features=features, + mode=mode, + labels=labels, + optimizer=tf.AdagradOptimizer(learning_rate=0.1), + logits=logits) + + my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn) + ``` + Args: weight_column: A string or a `_NumericColumn` created by `tf.feature_column.numeric_column` defining feature column representing @@ -296,7 +407,7 @@ def poisson_regression_head( def _poisson_loss(labels, logits): return nn.log_poisson_loss( targets=labels, log_input=logits, compute_full_loss=compute_full_loss) - return head_lib._regression_head_with_mean_squared_error_loss( # pylint:disable=protected-access + return head_lib._regression_head( # pylint:disable=protected-access weight_column=weight_column, label_dimension=label_dimension, loss_reduction=loss_reduction, @@ -305,12 +416,103 @@ def _poisson_loss(labels, logits): name=name) +def logistic_regression_head( + weight_column=None, + loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE, + name=None): + """Creates a `_Head` for logistic regression. + + Uses `sigmoid_cross_entropy_with_logits` loss, which is the same as + `binary_classification_head`. The differences compared to + `binary_classification_head` are: + + * Does not support `label_vocabulary`. Instead, labels must be float in the + range [0, 1]. + * Does not calculate some metrics that do not make sense, such as AUC. + * In `PREDICT` mode, only returns logits and predictions + (`=tf.sigmoid(logits)`), whereas `binary_classification_head` also returns + probabilities, classes, and class_ids. + * Export output defaults to `RegressionOutput`, whereas + `binary_classification_head` defaults to `PredictOutput`. + + The head expects `logits` with shape `[D0, D1, ... DN, 1]`. + In many applications, the shape is `[batch_size, 1]`. + + The `labels` shape must match `logits`, namely + `[D0, D1, ... DN]` or `[D0, D1, ... DN, 1]`. + + If `weight_column` is specified, weights must be of shape + `[D0, D1, ... DN]` or `[D0, D1, ... DN, 1]`. + + This is implemented as a generalized linear model, see + https://en.wikipedia.org/wiki/Generalized_linear_model. + + The head can be used with a canned estimator. Example: + + ```python + my_head = tf.contrib.estimator.logistic_regression_head() + my_estimator = tf.contrib.estimator.DNNEstimator( + head=my_head, + hidden_units=..., + feature_columns=...) + ``` + + It can also be used with a custom `model_fn`. 
Example: + + ```python + def _my_model_fn(features, labels, mode): + my_head = tf.contrib.estimator.logistic_regression_head() + logits = tf.keras.Model(...)(features) + + return my_head.create_estimator_spec( + features=features, + mode=mode, + labels=labels, + optimizer=tf.AdagradOptimizer(learning_rate=0.1), + logits=logits) + + my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn) + ``` + + Args: + weight_column: A string or a `_NumericColumn` created by + `tf.feature_column.numeric_column` defining feature column representing + weights. It is used to down weight or boost examples during training. It + will be multiplied by the loss of the example. + loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how to + reduce training loss over batch and label dimension. Defaults to + `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by + `batch size * label_dimension`. See `tf.losses.Reduction`. + name: name of the head. If provided, summary and metrics keys will be + suffixed by `"/" + name`. Also used as `name_scope` when creating ops. + + Returns: + An instance of `_Head` for logistic regression. + + Raises: + ValueError: If `loss_reduction` is invalid. + """ + def _logistic_loss(labels, logits): + labels = head_lib._assert_range( # pylint:disable=protected-access + labels, n_classes=2, message='Labels must be in range [0, 1]') + return nn.sigmoid_cross_entropy_with_logits( + labels=labels, logits=logits) + return head_lib._regression_head( # pylint:disable=protected-access + weight_column=weight_column, + label_dimension=1, + loss_reduction=loss_reduction, + loss_fn=_logistic_loss, + inverse_link_fn=math_ops.sigmoid, + name=name) + + def multi_label_head(n_classes, weight_column=None, thresholds=None, label_vocabulary=None, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE, loss_fn=None, + classes_for_class_based_metrics=None, name=None): """Creates a `_Head` for multi-label classification. @@ -342,6 +544,33 @@ def multi_label_head(n_classes, shape `[D0, D1, ... DN, n_classes]`. Namely, the head applies `label_vocabulary` to the input labels before passing them to `loss_fn`. + The head can be used with a canned estimator. Example: + + ```python + my_head = tf.contrib.estimator.multi_label_head(n_classes=3) + my_estimator = tf.contrib.estimator.DNNEstimator( + head=my_head, + hidden_units=..., + feature_columns=...) + ``` + + It can also be used with a custom `model_fn`. Example: + + ```python + def _my_model_fn(features, labels, mode): + my_head = tf.contrib.estimator.multi_label_head(n_classes=3) + logits = tf.keras.Model(...)(features) + + return my_head.create_estimator_spec( + features=features, + mode=mode, + labels=labels, + optimizer=tf.AdagradOptimizer(learning_rate=0.1), + logits=logits) + + my_estimator = tf.estimator.Estimator(model_fn=_my_model_fn) + ``` + Args: n_classes: Number of classes, must be greater than 1 (for 1 class, use `binary_classification_head`). @@ -363,6 +592,10 @@ def multi_label_head(n_classes, reduce training loss over batch. Defaults to `SUM_OVER_BATCH_SIZE`, namely weighted sum of losses divided by batch size. See `tf.losses.Reduction`. loss_fn: Optional loss function. + classes_for_class_based_metrics: List of integer class IDs or string class + names for which per-class metrics are evaluated. If integers, all must be + in the range `[0, n_classes - 1]`. If strings, all must be in + `label_vocabulary`. name: name of the head. If provided, summary and metrics keys will be suffixed by `"/" + name`. 
Also used as `name_scope` when creating ops. @@ -370,8 +603,8 @@ def multi_label_head(n_classes, An instance of `_Head` for multi-label classification. Raises: - ValueError: if `n_classes`, `thresholds`, `loss_reduction` or `loss_fn` is - invalid. + ValueError: if `n_classes`, `thresholds`, `loss_reduction`, `loss_fn` or + `metric_class_ids` is invalid. """ thresholds = tuple(thresholds) if thresholds else tuple() if n_classes is None or n_classes < 2: @@ -396,10 +629,31 @@ def multi_label_head(n_classes, if (loss_reduction not in losses.Reduction.all() or loss_reduction == losses.Reduction.NONE): raise ValueError('Invalid loss_reduction: {}'.format(loss_reduction)) + classes_for_class_based_metrics = tuple( + [] if classes_for_class_based_metrics is None + else classes_for_class_based_metrics) + if classes_for_class_based_metrics: + if isinstance(classes_for_class_based_metrics[0], six.string_types): + if not label_vocabulary: + raise ValueError( + 'label_vocabulary must be provided when ' + 'classes_for_class_based_metrics are sting.') + class_ids = [] + for class_string in classes_for_class_based_metrics: + class_ids.append(label_vocabulary.index(class_string)) + classes_for_class_based_metrics = tuple(class_ids) + else: + for class_id in classes_for_class_based_metrics: + if (class_id < 0) or (class_id >= n_classes): + raise ValueError( + 'All classes_for_class_based_metrics must be in range [0, {}]. ' + 'Given: {}'.format(n_classes - 1, class_id)) return _MultiLabelHead( n_classes=n_classes, weight_column=weight_column, thresholds=thresholds, label_vocabulary=label_vocabulary, loss_reduction=loss_reduction, - loss_fn=loss_fn, name=name) + loss_fn=loss_fn, + classes_for_class_based_metrics=classes_for_class_based_metrics, + name=name) class _MultiLabelHead(head_lib._Head): # pylint:disable=protected-access @@ -412,6 +666,7 @@ def __init__(self, label_vocabulary=None, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE, loss_fn=None, + classes_for_class_based_metrics=None, name=None): self._n_classes = n_classes self._weight_column = weight_column @@ -419,6 +674,7 @@ def __init__(self, self._label_vocabulary = label_vocabulary self._loss_reduction = loss_reduction self._loss_fn = loss_fn + self._classes_for_class_based_metrics = classes_for_class_based_metrics self._name = name @property @@ -496,10 +752,10 @@ def create_loss(self, features, mode, logits, labels): weights=weights, processed_labels=processed_labels) - def create_estimator_spec( + def _create_tpu_estimator_spec( self, features, mode, logits, labels=None, optimizer=None, train_op_fn=None, regularization_losses=None): - """Returns an `EstimatorSpec`. + """Returns an `model_fn._TPUEstimatorSpec`. Args: features: Input `dict` of `Tensor` or `SparseTensor` objects. @@ -522,7 +778,7 @@ def create_estimator_spec( `loss_reduction=SUM_OVER_NONZERO_WEIGHTS` when creating the head to avoid scaling errors. Returns: - `EstimatorSpec`. + `model_fn._TPUEstimatorSpec`. Raises: ValueError: If both `train_op_fn` and `optimizer` are `None` in TRAIN mode, or if both are set. @@ -542,7 +798,7 @@ def create_estimator_spec( classifier_output = head_lib._classification_output( # pylint:disable=protected-access scores=probabilities, n_classes=self._n_classes, label_vocabulary=self._label_vocabulary) - return model_fn.EstimatorSpec( + return model_fn._TPUEstimatorSpec( # pylint:disable=protected-access mode=model_fn.ModeKeys.PREDICT, predictions=predictions, export_outputs={ @@ -565,16 +821,18 @@ def create_estimator_spec( # Eval. 
if mode == model_fn.ModeKeys.EVAL: - return model_fn.EstimatorSpec( + return model_fn._TPUEstimatorSpec( # pylint:disable=protected-access mode=model_fn.ModeKeys.EVAL, predictions=predictions, loss=regularized_training_loss, - eval_metric_ops=self._eval_metric_ops( - labels=processed_labels, - probabilities=probabilities, - weights=weights, - unreduced_loss=unreduced_loss, - regularization_loss=regularization_loss)) + eval_metrics=head_lib._create_eval_metrics_tuple( # pylint:disable=protected-access + self._eval_metric_ops, { + 'labels': processed_labels, + 'probabilities': probabilities, + 'weights': weights, + 'unreduced_loss': unreduced_loss, + 'regularization_loss': regularization_loss, + })) # Train. if optimizer is not None: @@ -587,6 +845,7 @@ def create_estimator_spec( train_op = train_op_fn(regularized_training_loss) else: raise ValueError('train_op_fn and optimizer cannot both be None.') + train_op = head_lib._append_update_ops(train_op) # pylint:disable=protected-access # Only summarize mean_loss for SUM reduction to preserve backwards # compatibility. Otherwise skip it to avoid unnecessary computation. if self._loss_reduction == losses.Reduction.SUM: @@ -608,7 +867,7 @@ def create_estimator_spec( summary.scalar( head_lib._summary_key(self._name, keys.LOSS_REGULARIZATION), # pylint:disable=protected-access regularization_loss) - return model_fn.EstimatorSpec( + return model_fn._TPUEstimatorSpec( # pylint:disable=protected-access mode=model_fn.ModeKeys.TRAIN, predictions=predictions, loss=regularized_training_loss, @@ -671,4 +930,36 @@ def _eval_metric_ops( weights=weights, threshold=threshold, name=recall_key)) + for class_id in self._classes_for_class_based_metrics: + batch_rank = array_ops.rank(probabilities) - 1 + begin = array_ops.concat( + [array_ops.zeros([batch_rank], dtype=dtypes.int32), [class_id]], + axis=0) + size = array_ops.concat( + [-1 * array_ops.ones([batch_rank], dtype=dtypes.int32), [1]], + axis=0) + class_probabilities = array_ops.slice( + probabilities, begin=begin, size=size) + class_labels = array_ops.slice(labels, begin=begin, size=size) + prob_key = keys.PROBABILITY_MEAN_AT_CLASS % class_id + metric_ops[head_lib._summary_key(self._name, prob_key)] = ( # pylint:disable=protected-access + head_lib._predictions_mean( # pylint:disable=protected-access + predictions=class_probabilities, + weights=weights, + name=prob_key)) + auc_key = keys.AUC_AT_CLASS % class_id + metric_ops[head_lib._summary_key(self._name, auc_key)] = ( # pylint:disable=protected-access + head_lib._auc( # pylint:disable=protected-access + labels=class_labels, + predictions=class_probabilities, + weights=weights, + name=auc_key)) + auc_pr_key = keys.AUC_PR_AT_CLASS % class_id + metric_ops[head_lib._summary_key(self._name, auc_pr_key)] = ( # pylint:disable=protected-access + head_lib._auc( # pylint:disable=protected-access + labels=class_labels, + predictions=class_probabilities, + weights=weights, + curve='PR', + name=auc_pr_key)) return metric_ops diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py index 98962ca4277a3e..b2b57fa06ba818 100644 --- a/tensorflow/contrib/estimator/python/estimator/head_test.py +++ b/tensorflow/contrib/estimator/python/estimator/head_test.py @@ -36,6 +36,7 @@ from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import string_ops +from tensorflow.python.ops import variables from tensorflow.python.ops.losses import 
losses from tensorflow.python.platform import test from tensorflow.python.saved_model import signature_constants @@ -175,6 +176,21 @@ def _loss_fn(labels, logits, name=None): r'loss_fn has unexpected args: \[\'name\'\]'): head_lib.multi_label_head(n_classes=3, loss_fn=_loss_fn) + def test_classes_for_class_based_metrics_invalid(self): + with self.assertRaisesRegexp( + ValueError, + r'All classes_for_class_based_metrics must be in range \[0, 2\]\. ' + r'Given: -1'): + head_lib.multi_label_head( + n_classes=3, classes_for_class_based_metrics=[2, -1]) + + def test_classes_for_class_based_metrics_string_invalid(self): + with self.assertRaisesRegexp( + ValueError, r'\'z\' is not in list'): + head_lib.multi_label_head( + n_classes=3, label_vocabulary=['a', 'b', 'c'], + classes_for_class_based_metrics=['c', 'z']) + def test_name(self): head = head_lib.multi_label_head(n_classes=4, name='foo') self.assertEqual('foo', head.name) @@ -591,6 +607,81 @@ def test_eval_with_thresholds(self): expected_loss=expected_loss, expected_metrics=expected_metrics) + def test_eval_with_classes_for_class_based_metrics(self): + head = head_lib.multi_label_head( + n_classes=2, classes_for_class_based_metrics=[0, 1]) + + logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32) + labels = np.array([[1, 0], [1, 1]], dtype=np.int64) + # loss = labels * -log(sigmoid(logits)) + + # (1 - labels) * -log(1 - sigmoid(logits)) + # Sum over examples, divide by batch_size. + expected_loss = 0.5 * np.sum( + _sigmoid_cross_entropy(labels=labels, logits=logits)) + + keys = metric_keys.MetricKeys + expected_metrics = { + # Average loss over examples. + keys.LOSS_MEAN: expected_loss, + # auc and auc_pr cannot be reliably calculated for only 4 samples, but + # this assert tests that the algorithm remains consistent. + keys.AUC: 0.3333, + keys.AUC_PR: 0.7639, + keys.PROBABILITY_MEAN_AT_CLASS % 0: np.sum(_sigmoid(logits[:, 0])) / 2., + keys.AUC_AT_CLASS % 0: 0., + keys.AUC_PR_AT_CLASS % 0: 1., + keys.PROBABILITY_MEAN_AT_CLASS % 1: np.sum(_sigmoid(logits[:, 1])) / 2., + keys.AUC_AT_CLASS % 1: 1., + keys.AUC_PR_AT_CLASS % 1: 1., + } + + self._test_eval( + head=head, + logits=logits, + labels=labels, + expected_loss=expected_loss, + expected_metrics=expected_metrics) + + def test_eval_with_classes_for_class_based_metrics_string(self): + head = head_lib.multi_label_head( + n_classes=2, label_vocabulary=['a', 'b'], + classes_for_class_based_metrics=['a', 'b']) + + logits = np.array([[-1., 1.], [-1.5, 1.5]], dtype=np.float32) + labels = sparse_tensor.SparseTensor( + values=['a', 'a', 'b'], + indices=[[0, 0], [1, 0], [1, 1]], + dense_shape=[2, 2]) + labels_onehot = np.array([[1, 0], [1, 1]], dtype=np.int64) + # loss = labels * -log(sigmoid(logits)) + + # (1 - labels) * -log(1 - sigmoid(logits)) + # Sum over examples, divide by batch_size. + expected_loss = 0.5 * np.sum( + _sigmoid_cross_entropy(labels=labels_onehot, logits=logits)) + + keys = metric_keys.MetricKeys + expected_metrics = { + # Average loss over examples. + keys.LOSS_MEAN: expected_loss, + # auc and auc_pr cannot be reliably calculated for only 4 samples, but + # this assert tests that the algorithm remains consistent. 
+ keys.AUC: 0.3333, + keys.AUC_PR: 0.7639, + keys.PROBABILITY_MEAN_AT_CLASS % 0: np.sum(_sigmoid(logits[:, 0])) / 2., + keys.AUC_AT_CLASS % 0: 0., + keys.AUC_PR_AT_CLASS % 0: 1., + keys.PROBABILITY_MEAN_AT_CLASS % 1: np.sum(_sigmoid(logits[:, 1])) / 2., + keys.AUC_AT_CLASS % 1: 1., + keys.AUC_PR_AT_CLASS % 1: 1., + } + + self._test_eval( + head=head, + logits=logits, + labels=labels, + expected_loss=expected_loss, + expected_metrics=expected_metrics) + def test_eval_with_weights(self): n_classes = 2 head = head_lib.multi_label_head(n_classes, weight_column='example_weights') @@ -899,6 +990,34 @@ def minimize(self, loss, global_step): six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)), train_result) + def test_train_with_update_ops(self): + head = head_lib.multi_label_head(n_classes=2) + + with ops.Graph().as_default(): + w = variables.Variable(1) + update_op = w.assign_add(1) + ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op) + + t = variables.Variable('') + expected_train_result = b'my_train_op' + def _train_op_fn(loss): + del loss + return t.assign(expected_train_result) + + spec = head.create_estimator_spec( + features={'x': np.array(((42,),), dtype=np.int32)}, + mode=model_fn.ModeKeys.TRAIN, + logits=np.array([[-10., 10.], [-15., 10.]], dtype=np.float32), + labels=np.array([[1, 0], [1, 1]], dtype=np.int64), + train_op_fn=_train_op_fn) + + with self.test_session() as sess: + _initialize_variables(self, spec.scaffold) + sess.run(spec.train_op) + w_value, t_value = sess.run([w, t]) + self.assertEqual(2, w_value) + self.assertEqual(expected_train_result, t_value) + def test_train_with_regularization_losses(self): head = head_lib.multi_label_head( n_classes=2, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE) @@ -1211,5 +1330,124 @@ def test_predict(self): self.assertAllClose(logits, spec.predictions[keys.LOGITS].eval()) +class LogisticRegressionHead(test.TestCase): + + def setUp(self): + ops.reset_default_graph() + + def test_train(self): + head = head_lib.logistic_regression_head() + + # Create estimator spec. + logits = np.array([[0], [-1], [1]], dtype=np.float32) + labels = np.array([[.4], [.6], [.8]], dtype=np.float32) + # Following the documentation in + # tf.nn.sigmoid_cross_entropy_with_logits: + # With x = logits, z = labels. + # loss = max(x, 0) - x * z + log(1 + exp(-abs(x))) + # loss = [0 - 0 * 0.4 + ln(1 + exp(-0)), + # 0 + 1 * 0.6 + ln(1 + exp(-1)), + # 1 - 1 * 0.8 + ln(1 + exp(-1))] + # = [0.6931, 0.9133, 0.5133] + # training_loss = (0.6931 + 0.9133 + 0.5133) / 3 + expected_loss = 0.7066 + atol = 0.001 + expected_train_result = b'my_train_op' + def _train_op_fn(loss): + with ops.control_dependencies((check_ops.assert_near( + math_ops.to_float(expected_loss), math_ops.to_float(loss), + atol=atol, name='assert_loss'),)): + return constant_op.constant(expected_train_result) + + spec = head.create_estimator_spec( + features={'x': np.array(((42.,),), dtype=np.int32)}, + mode=model_fn.ModeKeys.TRAIN, + logits=logits, + labels=labels, + train_op_fn=_train_op_fn) + + with self.test_session() as sess: + _initialize_variables(self, spec.scaffold) + loss, train_result = sess.run([spec.loss, spec.train_op]) + self.assertAlmostEqual(expected_loss, loss, delta=atol) + self.assertEqual(expected_train_result, train_result) + + def test_train_labels_too_large(self): + head = head_lib.logistic_regression_head() + + # Create estimator spec. 
+ logits = np.array([[0], [-1], [1]], dtype=np.float32) + labels = np.array([[.4], [1.2], [.8]], dtype=np.float32) + expected_train_result = b'my_train_op' + def _train_op_fn(loss): + del loss + return constant_op.constant(expected_train_result) + + spec = head.create_estimator_spec( + features={'x': np.array(((42.,),), dtype=np.int32)}, + mode=model_fn.ModeKeys.TRAIN, + logits=logits, + labels=labels, + train_op_fn=_train_op_fn) + + with self.test_session() as sess: + _initialize_variables(self, spec.scaffold) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r'\[Labels must be in range \[0, 1\]\] .* \[\[0.4\]\[1.2\]\[0.8\]\]'): + _ = sess.run(spec.loss) + + def test_train_labels_negative(self): + head = head_lib.logistic_regression_head() + + # Create estimator spec. + logits = np.array([[0], [-1], [1]], dtype=np.float32) + labels = np.array([[.4], [-0.2], [.8]], dtype=np.float32) + expected_train_result = b'my_train_op' + def _train_op_fn(loss): + del loss + return constant_op.constant(expected_train_result) + + spec = head.create_estimator_spec( + features={'x': np.array(((42.,),), dtype=np.int32)}, + mode=model_fn.ModeKeys.TRAIN, + logits=logits, + labels=labels, + train_op_fn=_train_op_fn) + + with self.test_session() as sess: + _initialize_variables(self, spec.scaffold) + with self.assertRaisesRegexp( + errors.InvalidArgumentError, + r'\[Labels must be in range \[0, 1\]\] .* \[\[0.4\]\[-0.2\]\[0.8\]\]' + ): + _ = sess.run(spec.loss) + + def test_predict(self): + head = head_lib.logistic_regression_head() + + # Create estimator spec. + logits = np.array([[0], [-1], [1]], dtype=np.float32) + expected_predictions = 1. / (1. + np.exp(-logits)) + spec = head.create_estimator_spec( + features={'x': np.array(((42.,),), dtype=np.int32)}, + mode=model_fn.ModeKeys.PREDICT, + logits=logits) + + # Assert spec contains expected tensors. + keys = prediction_keys.PredictionKeys + self.assertItemsEqual( + (keys.PREDICTIONS, keys.LOGITS), spec.predictions.keys()) + self.assertEqual(dtypes.float32, spec.predictions[keys.PREDICTIONS].dtype) + self.assertEqual(dtypes.float32, spec.predictions[keys.LOGITS].dtype) + + # Assert predictions. + with self.test_session(): + _initialize_variables(self, spec.scaffold) + self.assertAllClose( + expected_predictions, spec.predictions[keys.PREDICTIONS].eval()) + self.assertAllClose(logits, spec.predictions[keys.LOGITS].eval()) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/estimator/python/estimator/hooks.py b/tensorflow/contrib/estimator/python/estimator/hooks.py new file mode 100644 index 00000000000000..ddd6aa442f82ba --- /dev/null +++ b/tensorflow/contrib/estimator/python/estimator/hooks.py @@ -0,0 +1,213 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +"""Some useful session run hooks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensorflow.python.estimator import estimator as estimator_lib +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import state_ops +from tensorflow.python.training import training + + +# pylint: disable=protected-access +class InMemoryEvaluatorHook(training.SessionRunHook): + """Hook to run evaluation in training without a checkpoint. + + Example: + + ```python + def train_input_fn(): + ... + return train_dataset + + def eval_input_fn(): + ... + return eval_dataset + + estimator = tf.estimator.DNNClassifier(...) + + evaluator = tf.contrib.estimator.InMemoryEvaluatorHook( + estimator, eval_input_fn) + estimator.train(train_input_fn, hooks=[evaluator]) + ``` + + Current limitations of this approach are: + * It doesn't support multi-node distributed mode. + * It doesn't support saveable objects other than variables (such as boosted + tree support) + * It doesn't support custom saver logic (such as ExponentialMovingAverage + support) + + """ + + def __init__(self, + estimator, + input_fn, + steps=None, + hooks=None, + name=None, + every_n_iter=100): + """Initializes a `InMemoryEvaluatorHook`. + + Args: + estimator: A `tf.estimator.Estimator` instance to call evaluate. + input_fn: Equivalent to the `input_fn` arg to `estimator.evaluate`. A + function that constructs the input data for evaluation. + See @{$premade_estimators#create_input_functions} for more + information. The function should construct and return one of + the following: + + * A 'tf.data.Dataset' object: Outputs of `Dataset` object must be a + tuple (features, labels) with same constraints as below. + * A tuple (features, labels): Where `features` is a `Tensor` or a + dictionary of string feature name to `Tensor` and `labels` is a + `Tensor` or a dictionary of string label name to `Tensor`. Both + `features` and `labels` are consumed by `model_fn`. They should + satisfy the expectation of `model_fn` from inputs. + + steps: Equivalent to the `steps` arg to `estimator.evaluate`. Number of + steps for which to evaluate model. If `None`, evaluates until `input_fn` + raises an end-of-input exception. + hooks: Equivalent to the `hooks` arg to `estimator.evaluate`. List of + `SessionRunHook` subclass instances. Used for callbacks inside the + evaluation call. + name: Equivalent to the `name` arg to `estimator.evaluate`. Name of the + evaluation if user needs to run multiple evaluations on different data + sets, such as on training data vs test data. Metrics for different + evaluations are saved in separate folders, and appear separately in + tensorboard. + every_n_iter: `int`, runs the evaluator once every N training iteration. + + Raises: + ValueError: if `every_n_iter` is non-positive or it's not a single machine + training + """ + if every_n_iter is None or every_n_iter <= 0: + raise ValueError('invalid every_n_iter=%s.' 
% every_n_iter) + if (estimator.config.num_ps_replicas > 0 or + estimator.config.num_worker_replicas > 1): + raise ValueError( + 'InMemoryEvaluator supports only single machine (aka Local) setting.') + self._estimator = estimator + self._input_fn = input_fn + self._steps = steps + self._name = name + self._every_n_iter = every_n_iter + self._eval_dir = os.path.join(self._estimator.model_dir, 'eval' + if not name else 'eval_' + name) + + self._graph = None + self._hooks = estimator_lib._check_hooks_type(hooks) + self._hooks.extend(self._estimator._convert_eval_steps_to_hooks(steps)) + self._timer = training.SecondOrStepTimer(every_steps=every_n_iter) + + def begin(self): + """Build eval graph and restoring op.""" + self._timer.reset() + self._iter_count = 0 + self._graph = ops.Graph() + with self._graph.as_default(): + (self._scaffold, self._update_op, self._eval_dict, + self._all_hooks) = self._estimator._evaluate_build_graph( + self._input_fn, self._hooks, checkpoint_path=None) + + if self._scaffold.saver is not None: + raise ValueError('InMemoryEvaluator does not support custom saver') + if self._scaffold.init_fn is not None: + raise ValueError('InMemoryEvaluator does not support custom init_fn') + + self._var_name_to_eval_var = { + v.name: v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + } + self._var_name_to_placeholder = { + v.name: array_ops.placeholder(v.dtype) + for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + } + + def after_create_session(self, session, coord): # pylint: disable=unused-argument + """Does first run which shows the eval metrics before training.""" + if ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS): + raise ValueError( + 'InMemoryEvaluator does not support saveables other than global ' + 'variables.') + self._var_name_to_train_var = { + v.name: v for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES) + } + var_names_to_transfer = set(self._var_name_to_placeholder.keys()) & set( + self._var_name_to_train_var.keys()) + # Filter training var names that are not exist in evaluation + self._var_name_to_train_var = { + v_name: self._var_name_to_train_var[v_name] + for v_name in var_names_to_transfer + } + # Filter eval var names that are not exist in training + self._var_name_to_eval_var = { + v_name: self._var_name_to_eval_var[v_name] + for v_name in var_names_to_transfer + } + + with self._graph.as_default(): + self._var_feed_op = control_flow_ops.group([ + state_ops.assign(self._var_name_to_eval_var[v_name], + self._var_name_to_placeholder[v_name]) + for v_name in var_names_to_transfer + ]) + + self._evaluate(session) + + def _evaluate(self, train_session): + var_name_to_value = train_session.run(self._var_name_to_train_var) + placeholder_to_value = { + self._var_name_to_placeholder[v_name]: var_name_to_value[v_name] + for v_name in var_name_to_value + } + + def feed_variables(scaffold, session): + del scaffold + session.run(self._var_feed_op, feed_dict=placeholder_to_value) + + scaffold = training.Scaffold( + init_fn=feed_variables, copy_from_scaffold=self._scaffold) + + with self._graph.as_default(): + return self._estimator._evaluate_run( + checkpoint_path=None, + scaffold=scaffold, + update_op=self._update_op, + eval_dict=self._eval_dict, + all_hooks=self._all_hooks, + output_dir=self._eval_dir) + + self._timer.update_last_triggered_step(self._iter_count) + + def after_run(self, run_context, run_values): # pylint: disable=unused-argument + """Runs evaluator.""" + self._iter_count += 1 + if 
self._timer.should_trigger_for_step(self._iter_count): + self._evaluate(run_context.session) + + def end(self, session): # pylint: disable=unused-argument + """Runs evaluator for final model.""" + self._evaluate(session) + + +# pylint: enable=protected-access diff --git a/tensorflow/contrib/estimator/python/estimator/hooks_test.py b/tensorflow/contrib/estimator/python/estimator/hooks_test.py new file mode 100644 index 00000000000000..95ae971852ee6d --- /dev/null +++ b/tensorflow/contrib/estimator/python/estimator/hooks_test.py @@ -0,0 +1,318 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for hooks.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import glob +import json +import os + +from tensorflow.contrib.estimator.python.estimator import hooks as hooks_lib +from tensorflow.python.data.ops import dataset_ops +from tensorflow.python.estimator import estimator_lib +from tensorflow.python.estimator import run_config as run_config_lib +from tensorflow.python.feature_column import feature_column as feature_column_lib +from tensorflow.python.framework import constant_op +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import init_ops +from tensorflow.python.ops import metrics as metrics_lib +from tensorflow.python.ops import state_ops +from tensorflow.python.ops import variable_scope +from tensorflow.python.ops import variables +from tensorflow.python.platform import test +from tensorflow.python.summary import summary_iterator +from tensorflow.python.summary.writer import writer_cache +from tensorflow.python.training import training + + +def summary_step_keyword_to_value_mapping(dir_): + writer_cache.FileWriterCache.clear() + + # Get last Event written. + event_paths = glob.glob(os.path.join(dir_, 'events*')) + step_keyword_to_value = {} + for last_event in summary_iterator.summary_iterator(event_paths[-1]): + if last_event.step not in step_keyword_to_value: + step_keyword_to_value[last_event.step] = {} + if last_event.summary is not None: + for value in last_event.summary.value: + step_keyword_to_value[last_event.step][value.tag] = value.simple_value + + return step_keyword_to_value + + +def get_summary_value(dir_, step, keyword): + """Get summary value for given step and keyword.""" + + writer_cache.FileWriterCache.clear() + # Get last Event written. 
+ event_paths = glob.glob(os.path.join(dir_, 'events*')) + print('XXX', event_paths) + for last_event in summary_iterator.summary_iterator(event_paths[-1]): + if last_event.step == step and last_event.summary is not None: + for value in last_event.summary.value: + if keyword in value.tag: + return value.simple_value + return None + + +class InMemoryEvaluatorHookTest(test.TestCase): + + def test_runs_eval_metrics(self): + + def model_fn(features, labels, mode): + _ = labels + if estimator_lib.ModeKeys.TRAIN == mode: + with ops.control_dependencies([features]): + train_op = state_ops.assign_add(training.get_global_step(), 1) + return estimator_lib.EstimatorSpec( + mode, loss=constant_op.constant(3.), train_op=train_op) + if estimator_lib.ModeKeys.EVAL == mode: + return estimator_lib.EstimatorSpec( + mode, + loss=constant_op.constant(5.), + eval_metric_ops={'mean_of_features': metrics_lib.mean(features)}) + + estimator = estimator_lib.Estimator(model_fn=model_fn) + + def input_fn(): + return dataset_ops.Dataset.range(10) + + evaluator = hooks_lib.InMemoryEvaluatorHook( + estimator, input_fn, every_n_iter=4) + estimator.train(input_fn, hooks=[evaluator]) + + self.assertTrue(os.path.isdir(estimator.eval_dir())) + step_keyword_to_value = summary_step_keyword_to_value_mapping( + estimator.eval_dir()) + # 4.5 = sum(range(10))/10 + # before training + self.assertEqual(4.5, step_keyword_to_value[0]['mean_of_features']) + # intervals (every_n_iter=4) + self.assertEqual(4.5, step_keyword_to_value[4]['mean_of_features']) + self.assertEqual(4.5, step_keyword_to_value[8]['mean_of_features']) + # end + self.assertEqual(4.5, step_keyword_to_value[10]['mean_of_features']) + + def test_uses_latest_variable_value(self): + + def model_fn(features, labels, mode): + _ = labels + step = training.get_global_step() + w = variable_scope.get_variable( + 'w', + shape=[], + initializer=init_ops.zeros_initializer(), + dtype=dtypes.int64) + if estimator_lib.ModeKeys.TRAIN == mode: + # to consume features, we have control dependency + with ops.control_dependencies([features]): + step_inc = state_ops.assign_add(training.get_global_step(), 1) + with ops.control_dependencies([step_inc]): + assign_w_to_step_plus_2 = w.assign(step + 2) + return estimator_lib.EstimatorSpec( + mode, + loss=constant_op.constant(3.), + train_op=assign_w_to_step_plus_2) + if estimator_lib.ModeKeys.EVAL == mode: + # to consume features, we have control dependency + with ops.control_dependencies([features]): + loss = constant_op.constant(5.) + return estimator_lib.EstimatorSpec( + mode, + loss=loss, + # w is constant in each step, so the mean. 
+ # w = 0 if step==0 else step+2 + eval_metric_ops={'mean_of_const': metrics_lib.mean(w)}) + + estimator = estimator_lib.Estimator(model_fn=model_fn) + + def input_fn(): + return dataset_ops.Dataset.range(10) + + evaluator = hooks_lib.InMemoryEvaluatorHook( + estimator, input_fn, every_n_iter=4) + estimator.train(input_fn, hooks=[evaluator]) + + self.assertTrue(os.path.isdir(estimator.eval_dir())) + step_keyword_to_value = summary_step_keyword_to_value_mapping( + estimator.eval_dir()) + # w = 0 if step==0 else step+2 + self.assertEqual(0, step_keyword_to_value[0]['mean_of_const']) + self.assertEqual(6, step_keyword_to_value[4]['mean_of_const']) + self.assertEqual(12, step_keyword_to_value[10]['mean_of_const']) + + def test_dnn_classifier(self): + embedding = feature_column_lib.embedding_column( + feature_column_lib.categorical_column_with_vocabulary_list( + 'wire_cast', ['kima', 'omar', 'stringer']), 8) + dnn = estimator_lib.DNNClassifier( + feature_columns=[embedding], hidden_units=[3, 1]) + + def train_input_fn(): + return dataset_ops.Dataset.from_tensors(({ + 'wire_cast': [['omar'], ['kima']] + }, [[0], [1]])).repeat(3) + + def eval_input_fn(): + return dataset_ops.Dataset.from_tensors(({ + 'wire_cast': [['stringer'], ['kima']] + }, [[0], [1]])).repeat(2) + + evaluator = hooks_lib.InMemoryEvaluatorHook( + dnn, eval_input_fn, name='in-memory') + dnn.train(train_input_fn, hooks=[evaluator]) + self.assertTrue(os.path.isdir(dnn.eval_dir('in-memory'))) + step_keyword_to_value = summary_step_keyword_to_value_mapping( + dnn.eval_dir('in-memory')) + + final_metrics = dnn.evaluate(eval_input_fn) + step = final_metrics[ops.GraphKeys.GLOBAL_STEP] + for summary_tag in final_metrics: + if summary_tag == ops.GraphKeys.GLOBAL_STEP: + continue + self.assertEqual(final_metrics[summary_tag], + step_keyword_to_value[step][summary_tag]) + + def test_raise_error_with_multi_worker(self): + tf_config = { + 'cluster': { + run_config_lib.TaskType.CHIEF: ['host0:0'], + run_config_lib.TaskType.WORKER: ['host3:3', 'host4:4', 'host5:5'] + }, + 'task': { + 'type': run_config_lib.TaskType.CHIEF, + 'index': 0 + } + } + with test.mock.patch.dict('os.environ', + {'TF_CONFIG': json.dumps(tf_config)}): + dnn = estimator_lib.DNNClassifier( + feature_columns=[feature_column_lib.numeric_column('x')], + hidden_units=[3, 1]) + + def eval_input_fn(): + pass + + with self.assertRaisesRegexp(ValueError, 'supports only single machine'): + hooks_lib.InMemoryEvaluatorHook(dnn, eval_input_fn) + + def test_raise_error_with_ps(self): + tf_config = { + 'cluster': { + run_config_lib.TaskType.CHIEF: ['host0:0'], + run_config_lib.TaskType.PS: ['host1:1'], + }, + 'task': { + 'type': run_config_lib.TaskType.CHIEF, + 'index': 0 + } + } + with test.mock.patch.dict('os.environ', + {'TF_CONFIG': json.dumps(tf_config)}): + dnn = estimator_lib.DNNClassifier( + feature_columns=[feature_column_lib.numeric_column('x')], + hidden_units=[3, 1]) + + def eval_input_fn(): + pass + + with self.assertRaisesRegexp(ValueError, 'supports only single machine'): + hooks_lib.InMemoryEvaluatorHook(dnn, eval_input_fn) + + def test_raise_error_with_custom_saver_in_eval(self): + + def model_fn(features, labels, mode): + _, _ = features, labels + return estimator_lib.EstimatorSpec( + mode, + loss=constant_op.constant(3.), + scaffold=training.Scaffold(saver=training.Saver()), + train_op=constant_op.constant(5.), + eval_metric_ops={ + 'mean_of_features': metrics_lib.mean(constant_op.constant(2.)) + }) + + estimator = estimator_lib.Estimator(model_fn=model_fn) + + def 
input_fn(): + return dataset_ops.Dataset.range(10) + + evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn) + with self.assertRaisesRegexp(ValueError, 'does not support custom saver'): + evaluator.begin() + + def test_raise_error_with_custom_init_fn_in_eval(self): + + def model_fn(features, labels, mode): + _, _ = features, labels + + def init_fn(scaffold, session): + _, _ = scaffold, session + + return estimator_lib.EstimatorSpec( + mode, + loss=constant_op.constant(3.), + scaffold=training.Scaffold(init_fn=init_fn), + train_op=constant_op.constant(5.), + eval_metric_ops={ + 'mean_of_features': metrics_lib.mean(constant_op.constant(2.)) + }) + + estimator = estimator_lib.Estimator(model_fn=model_fn) + + def input_fn(): + return dataset_ops.Dataset.range(10) + + evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn) + with self.assertRaisesRegexp(ValueError, 'does not support custom init_fn'): + evaluator.begin() + + def test_raise_error_with_saveables_other_than_global_variables(self): + + def model_fn(features, labels, mode): + _, _ = features, labels + w = variables.Variable( + initial_value=[0.], + trainable=False, + collections=[ops.GraphKeys.SAVEABLE_OBJECTS]) + init_op = control_flow_ops.group( + [w.initializer, training.get_global_step().initializer]) + return estimator_lib.EstimatorSpec( + mode, + loss=constant_op.constant(3.), + scaffold=training.Scaffold(init_op=init_op), + train_op=constant_op.constant(5.), + eval_metric_ops={ + 'mean_of_features': metrics_lib.mean(constant_op.constant(2.)) + }) + + estimator = estimator_lib.Estimator(model_fn=model_fn) + + def input_fn(): + return dataset_ops.Dataset.range(10) + + evaluator = hooks_lib.InMemoryEvaluatorHook(estimator, input_fn) + with self.assertRaisesRegexp(ValueError, 'does not support saveables'): + estimator.train(input_fn, hooks=[evaluator]) + + +if __name__ == '__main__': + test.main() diff --git a/tensorflow/contrib/estimator/python/estimator/logit_fns.py b/tensorflow/contrib/estimator/python/estimator/logit_fns.py index 09c2862ccd3f90..c8b0dd62970e34 100644 --- a/tensorflow/contrib/estimator/python/estimator/logit_fns.py +++ b/tensorflow/contrib/estimator/python/estimator/logit_fns.py @@ -41,10 +41,10 @@ import six -from tensorflow.python.estimator import util from tensorflow.python.estimator.canned import dnn as dnn_core from tensorflow.python.estimator.canned import linear as linear_core from tensorflow.python.framework import ops +from tensorflow.python.util import function_utils # pylint: disable=protected-access dnn_logit_fn_builder = dnn_core._dnn_logit_fn_builder @@ -72,7 +72,7 @@ def call_logit_fn(logit_fn, features, mode, params, config): ValueError: if logit_fn does not return a Tensor or a dictionary mapping strings to Tensors. 
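The argument-dispatch pattern implemented here (and the reason `function_utils.fn_args` is substituted for the estimator-local helper throughout this change) is to inspect the user-supplied function's signature and forward only the optional arguments it actually declares. A rough, self-contained sketch of the idea, using `inspect.signature` as a stand-in for the internal `fn_args` helper:

```python
import inspect


def _fn_args(fn):
  """Names of the callable's parameters (simplified stand-in for fn_args)."""
  return tuple(inspect.signature(fn).parameters)


def call_logit_fn_sketch(logit_fn, features, mode, params, config):
  """Calls logit_fn, passing only the optional args its signature declares."""
  kwargs = {}
  fn_args = _fn_args(logit_fn)
  if 'mode' in fn_args:
    kwargs['mode'] = mode
  if 'params' in fn_args:
    kwargs['params'] = params
  if 'config' in fn_args:
    kwargs['config'] = config
  return logit_fn(features=features, **kwargs)


# A logit_fn that only accepts `features` and `mode` still works:
def my_logit_fn(features, mode):
  return [f * (2.0 if mode == 'train' else 1.0) for f in features]


print(call_logit_fn_sketch(my_logit_fn, features=[1.0, 2.0], mode='train',
                           params=None, config=None))  # [2.0, 4.0]
```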
""" - logit_fn_args = util.fn_args(logit_fn) + logit_fn_args = function_utils.fn_args(logit_fn) kwargs = {} if 'mode' in logit_fn_args: kwargs['mode'] = mode diff --git a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py index f8564446e5da3e..cda23aa437f954 100644 --- a/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py +++ b/tensorflow/contrib/estimator/python/estimator/replicate_model_fn.py @@ -32,7 +32,6 @@ from tensorflow.core.framework import node_def_pb2 from tensorflow.python.client import device_lib from tensorflow.python.estimator import model_fn as model_fn_lib -from tensorflow.python.estimator import util from tensorflow.python.estimator.export import export_output as export_output_lib from tensorflow.python.framework import device as framework_device from tensorflow.python.framework import ops as ops_lib @@ -48,6 +47,7 @@ from tensorflow.python.training import device_setter as device_setter_lib from tensorflow.python.training import optimizer as optimizer_lib from tensorflow.python.util import deprecation +from tensorflow.python.util import function_utils @deprecation.deprecated( @@ -521,7 +521,7 @@ def _get_loss_towers(model_fn, """Replicate the loss computation across devices.""" tower_specs = [] - model_fn_args = util.fn_args(model_fn) + model_fn_args = function_utils.fn_args(model_fn) optional_params = {} if 'params' in model_fn_args: optional_params['params'] = copy.deepcopy(params) diff --git a/tensorflow/contrib/estimator/python/estimator/rnn.py b/tensorflow/contrib/estimator/python/estimator/rnn.py index b475c12f5af3ae..7c49cd00d16777 100644 --- a/tensorflow/contrib/estimator/python/estimator/rnn.py +++ b/tensorflow/contrib/estimator/python/estimator/rnn.py @@ -229,6 +229,7 @@ def rnn_logit_fn(features, mode): rnn_outputs, _ = rnn.dynamic_rnn( cell=cell, inputs=sequence_input, + sequence_length=sequence_length, dtype=dtypes.float32, time_major=False) last_activations = _select_last_activations(rnn_outputs, sequence_length) @@ -328,6 +329,19 @@ def _train_op_fn(loss): logits=logits) +def _assert_rnn_cell_fn(rnn_cell_fn, num_units, cell_type): + """Assert arguments are valid and return rnn_cell_fn.""" + if rnn_cell_fn and (num_units or cell_type != USE_DEFAULT): + raise ValueError( + 'num_units and cell_type must not be specified when using rnn_cell_fn' + ) + if not rnn_cell_fn: + if cell_type == USE_DEFAULT: + cell_type = 'basic_rnn' + rnn_cell_fn = _make_rnn_cell_fn(num_units, cell_type) + return rnn_cell_fn + + class RNNClassifier(estimator.Estimator): """A classifier for TensorFlow RNN models. @@ -341,8 +355,8 @@ class RNNClassifier(estimator.Estimator): token_emb = embedding_column(categorical_column=token_sequence, ...) estimator = RNNClassifier( - num_units=[32, 16], cell_type='lstm', - sequence_feature_columns=[token_emb]) + sequence_feature_columns=[token_emb], + num_units=[32, 16], cell_type='lstm') # Input builders def input_fn_train: # returns x, y @@ -438,8 +452,8 @@ def __init__(self, encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 . Also there will be errors if vocabulary is not provided and labels are string. - optimizer: An instance of `tf.Optimizer` used to train the model. Defaults - to Adagrad optimizer. + optimizer: An instance of `tf.Optimizer` or string specifying optimizer + type. Defaults to Adagrad optimizer. input_layer_partitioner: Optional. Partitioner for input layer. 
Defaults to `min_max_variable_partitioner` with `min_slice_size` 64 << 20. config: `RunConfig` object to configure the runtime settings. @@ -448,14 +462,7 @@ def __init__(self, ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not compatible. """ - if rnn_cell_fn and (num_units or cell_type != USE_DEFAULT): - raise ValueError( - 'num_units and cell_type must not be specified when using rnn_cell_fn' - ) - if not rnn_cell_fn: - if cell_type == USE_DEFAULT: - cell_type = 'basic_rnn' - rnn_cell_fn = _make_rnn_cell_fn(num_units, cell_type) + rnn_cell_fn = _assert_rnn_cell_fn(rnn_cell_fn, num_units, cell_type) if n_classes == 2: head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss( # pylint: disable=protected-access @@ -479,3 +486,137 @@ def _model_fn(features, labels, mode, config): config=config) super(RNNClassifier, self).__init__( model_fn=_model_fn, model_dir=model_dir, config=config) + + +class RNNEstimator(estimator.Estimator): + """An Estimator for TensorFlow RNN models with user-specified head. + + Example: + + ```python + token_sequence = sequence_categorical_column_with_hash_bucket(...) + token_emb = embedding_column(categorical_column=token_sequence, ...) + + estimator = RNNEstimator( + head=tf.contrib.estimator.regression_head(), + sequence_feature_columns=[token_emb], + num_units=[32, 16], cell_type='lstm') + + # Or with custom RNN cell: + def rnn_cell_fn(mode): + cells = [ tf.contrib.rnn.LSTMCell(size) for size in [32, 16] ] + if mode == tf.estimator.ModeKeys.TRAIN: + cells = [ tf.contrib.rnn.DropoutWrapper(cell, input_keep_prob=0.5) + for cell in cells ] + return tf.contrib.rnn.MultiRNNCell(cells) + + estimator = RNNEstimator( + head=tf.contrib.estimator.regression_head(), + sequence_feature_columns=[token_emb], + rnn_cell_fn=rnn_cell_fn) + + # Input builders + def input_fn_train: # returns x, y + pass + estimator.train(input_fn=input_fn_train, steps=100) + + def input_fn_eval: # returns x, y + pass + metrics = estimator.evaluate(input_fn=input_fn_eval, steps=10) + def input_fn_predict: # returns x, None + pass + predictions = estimator.predict(input_fn=input_fn_predict) + ``` + + Input of `train` and `evaluate` should have following features, + otherwise there will be a `KeyError`: + + * if the head's `weight_column` is not `None`, a feature with + `key=weight_column` whose value is a `Tensor`. + * for each `column` in `sequence_feature_columns`: + - a feature with `key=column.name` whose `value` is a `SparseTensor`. + * for each `column` in `context_feature_columns`: + - if `column` is a `_CategoricalColumn`, a feature with `key=column.name` + whose `value` is a `SparseTensor`. + - if `column` is a `_WeightedCategoricalColumn`, two features: the first + with `key` the id column name, the second with `key` the weight column + name. Both features' `value` must be a `SparseTensor`. + - if `column` is a `_DenseColumn`, a feature with `key=column.name` + whose `value` is a `Tensor`. + + Loss and predicted output are determined by the specified head. + + @compatibility(eager) + Estimators are not compatible with eager execution. + @end_compatibility + """ + + def __init__(self, + head, + sequence_feature_columns, + context_feature_columns=None, + num_units=None, + cell_type=USE_DEFAULT, + rnn_cell_fn=None, + model_dir=None, + optimizer='Adagrad', + input_layer_partitioner=None, + config=None): + """Initializes a `RNNClassifier` instance. + + Args: + head: A `_Head` instance constructed with a method such as + `tf.contrib.estimator.multi_label_head`. 
This specifies the model's + output and loss function to be optimized. + sequence_feature_columns: An iterable containing the `FeatureColumn`s + that represent sequential input. All items in the set should either be + sequence columns (e.g. `sequence_numeric_column`) or constructed from + one (e.g. `embedding_column` with `sequence_categorical_column_*` as + input). + context_feature_columns: An iterable containing the `FeatureColumn`s + for contextual input. The data represented by these columns will be + replicated and given to the RNN at each timestep. These columns must be + instances of classes derived from `_DenseColumn` such as + `numeric_column`, not the sequential variants. + num_units: Iterable of integer number of hidden units per RNN layer. If + set, `cell_type` must also be specified and `rnn_cell_fn` must be + `None`. + cell_type: A subclass of `tf.nn.rnn_cell.RNNCell` or a string specifying + the cell type. Supported strings are: `'basic_rnn'`, `'lstm'`, and + `'gru'`. If set, `num_units` must also be specified and `rnn_cell_fn` + must be `None`. + rnn_cell_fn: A function with one argument, a `tf.estimator.ModeKeys`, and + returns an object of type `tf.nn.rnn_cell.RNNCell` that will be used to + construct the RNN. If set, `num_units` and `cell_type` cannot be set. + This is for advanced users who need additional customization beyond + `num_units` and `cell_type`. Note that `tf.nn.rnn_cell.MultiRNNCell` is + needed for stacked RNNs. + model_dir: Directory to save model parameters, graph and etc. This can + also be used to load checkpoints from the directory into a estimator to + continue training a previously saved model. + optimizer: An instance of `tf.Optimizer` or string specifying optimizer + type. Defaults to Adagrad optimizer. + input_layer_partitioner: Optional. Partitioner for input layer. Defaults + to `min_max_variable_partitioner` with `min_slice_size` 64 << 20. + config: `RunConfig` object to configure the runtime settings. + + Raises: + ValueError: If `num_units`, `cell_type`, and `rnn_cell_fn` are not + compatible. 
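To make the feature expectations above concrete, a minimal hypothetical `input_fn` for a single sequence column keyed `'tokens'` could look like the sketch below (the names, shapes, and label values are made up; the label shape ultimately depends on the supplied head):

```python
import tensorflow as tf


def _input_fn():
  # Sequence features are fed as SparseTensors: one row per example,
  # one column per timestep.
  tokens = tf.SparseTensor(
      indices=[[0, 0], [0, 1], [1, 0]],  # (example_index, timestep)
      values=['the', 'cat', 'dog'],
      dense_shape=[2, 2])                # batch of 2, max sequence length 2
  labels = tf.constant([[1.0], [0.0]])   # shape dictated by the chosen head
  return {'tokens': tokens}, labels
```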
+ """ + rnn_cell_fn = _assert_rnn_cell_fn(rnn_cell_fn, num_units, cell_type) + + def _model_fn(features, labels, mode, config): + return _rnn_model_fn( + features=features, + labels=labels, + mode=mode, + head=head, + rnn_cell_fn=rnn_cell_fn, + sequence_feature_columns=tuple(sequence_feature_columns or []), + context_feature_columns=tuple(context_feature_columns or []), + optimizer=optimizer, + input_layer_partitioner=input_layer_partitioner, + config=config) + super(RNNEstimator, self).__init__( + model_fn=_model_fn, model_dir=model_dir, config=config) diff --git a/tensorflow/contrib/estimator/python/estimator/rnn_test.py b/tensorflow/contrib/estimator/python/estimator/rnn_test.py index 393f94f5c7de02..959b40371aa5fa 100644 --- a/tensorflow/contrib/estimator/python/estimator/rnn_test.py +++ b/tensorflow/contrib/estimator/python/estimator/rnn_test.py @@ -25,12 +25,15 @@ import numpy as np import six +from tensorflow.contrib.data.python.ops import readers +from tensorflow.contrib.estimator.python.estimator import head as head_lib from tensorflow.contrib.estimator.python.estimator import rnn from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as seq_fc from tensorflow.core.example import example_pb2 from tensorflow.core.example import feature_pb2 from tensorflow.python.estimator import model_fn from tensorflow.python.estimator.canned import metric_keys +from tensorflow.python.estimator.canned import parsing_utils from tensorflow.python.estimator.canned import prediction_keys from tensorflow.python.estimator.export import export from tensorflow.python.estimator.inputs import numpy_io @@ -38,9 +41,9 @@ from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.lib.io import python_io from tensorflow.python.ops import check_ops from tensorflow.python.ops import math_ops -from tensorflow.python.ops import parsing_ops from tensorflow.python.ops import partitioned_variables from tensorflow.python.ops import rnn_cell from tensorflow.python.ops import state_ops @@ -50,7 +53,6 @@ from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache from tensorflow.python.training import checkpoint_utils -from tensorflow.python.training import input as input_lib from tensorflow.python.training import monitored_session from tensorflow.python.training import optimizer from tensorflow.python.training import training_util @@ -984,7 +986,10 @@ def predict_input_fn(): predictions[prediction_keys.PredictionKeys.CLASSES]) -class RNNClassifierIntegrationTest(test.TestCase): +class BaseRNNClassificationIntegrationTest(object): + + def __init__(self, _create_estimator_fn): + self._create_estimator_fn = _create_estimator_fn def setUp(self): self._model_dir = tempfile.mkdtemp() @@ -994,20 +999,11 @@ def tearDown(self): writer_cache.FileWriterCache.clear() shutil.rmtree(self._model_dir) - def _test_complete_flow( - self, train_input_fn, eval_input_fn, predict_input_fn, n_classes, - batch_size): - col = seq_fc.sequence_categorical_column_with_hash_bucket( - 'tokens', hash_bucket_size=10) - embed = fc.embedding_column(col, dimension=2) - feature_columns = [embed] - + def _test_complete_flow(self, feature_columns, train_input_fn, eval_input_fn, + predict_input_fn, n_classes, batch_size): cell_units = [4, 2] - est = rnn.RNNClassifier( - num_units=cell_units, - sequence_feature_columns=feature_columns, - n_classes=n_classes, - 
model_dir=self._model_dir) + est = self._create_estimator_fn(feature_columns, n_classes, cell_units, + self._model_dir) # TRAIN num_steps = 10 @@ -1026,10 +1022,10 @@ def _test_complete_flow( self.assertAllEqual((batch_size, n_classes), predicted_proba.shape) # EXPORT - feature_spec = { - 'tokens': parsing_ops.VarLenFeature(dtypes.string), - 'label': parsing_ops.FixedLenFeature([1], dtypes.int64), - } + feature_spec = parsing_utils.classifier_parse_example_spec( + feature_columns, + label_key='label', + label_dtype=dtypes.int64) serving_input_receiver_fn = export.build_parsing_serving_input_receiver_fn( feature_spec) export_dir = est.export_savedmodel(tempfile.mkdtemp(), @@ -1069,7 +1065,13 @@ def testNumpyInputFn(self): batch_size=batch_size, shuffle=False) + col = seq_fc.sequence_categorical_column_with_hash_bucket( + 'tokens', hash_bucket_size=10) + embed = fc.embedding_column(col, dimension=2) + feature_columns = [embed] + self._test_complete_flow( + feature_columns=feature_columns, train_input_fn=train_input_fn, eval_input_fn=eval_input_fn, predict_input_fn=predict_input_fn, @@ -1082,7 +1084,8 @@ def testParseExampleInputFn(self): batch_size = 10 words = [b'dog', b'cat', b'bird', b'the', b'a', b'sat', b'flew', b'slept'] - serialized_examples = [] + _, examples_file = tempfile.mkstemp() + writer = python_io.TFRecordWriter(examples_file) for _ in range(batch_size): sequence_length = random.randint(1, len(words)) sentence = random.sample(words, sequence_length) @@ -1096,30 +1099,36 @@ def testParseExampleInputFn(self): feature_pb2.Feature(int64_list=feature_pb2.Int64List( value=[label])), })) - serialized_examples.append(example.SerializeToString()) + writer.write(example.SerializeToString()) + writer.close() + + col = seq_fc.sequence_categorical_column_with_hash_bucket( + 'tokens', hash_bucket_size=10) + embed = fc.embedding_column(col, dimension=2) + feature_columns = [embed] + feature_spec = parsing_utils.classifier_parse_example_spec( + feature_columns, + label_key='label', + label_dtype=dtypes.int64) - feature_spec = { - 'tokens': parsing_ops.VarLenFeature(dtypes.string), - 'label': parsing_ops.FixedLenFeature([1], dtypes.int64), - } def _train_input_fn(): - features = parsing_ops.parse_example(serialized_examples, feature_spec) - labels = features.pop('label') - return features, labels + dataset = readers.make_batched_features_dataset( + examples_file, batch_size, feature_spec) + return dataset.map(lambda features: (features, features.pop('label'))) def _eval_input_fn(): - features = parsing_ops.parse_example( - input_lib.limit_epochs(serialized_examples, num_epochs=1), - feature_spec) - labels = features.pop('label') - return features, labels + dataset = readers.make_batched_features_dataset( + examples_file, batch_size, feature_spec, num_epochs=1) + return dataset.map(lambda features: (features, features.pop('label'))) def _predict_input_fn(): - features = parsing_ops.parse_example( - input_lib.limit_epochs(serialized_examples, num_epochs=1), - feature_spec) - features.pop('label') - return features, None + dataset = readers.make_batched_features_dataset( + examples_file, batch_size, feature_spec, num_epochs=1) + def features_fn(features): + features.pop('label') + return features + return dataset.map(features_fn) self._test_complete_flow( + feature_columns=feature_columns, train_input_fn=_train_input_fn, eval_input_fn=_eval_input_fn, predict_input_fn=_predict_input_fn, @@ -1127,5 +1136,37 @@ def _predict_input_fn(): batch_size=batch_size) +def _rnn_classifier_fn(feature_columns, 
n_classes, cell_units, model_dir): + return rnn.RNNClassifier( + num_units=cell_units, + sequence_feature_columns=feature_columns, + n_classes=n_classes, + model_dir=model_dir) + + +class RNNClassifierIntegrationTest(BaseRNNClassificationIntegrationTest, + test.TestCase): + + def __init__(self, methodName='runTest'): # pylint: disable=invalid-name + test.TestCase.__init__(self, methodName) + BaseRNNClassificationIntegrationTest.__init__(self, _rnn_classifier_fn) + + +def _rnn_estimator_fn(feature_columns, n_classes, cell_units, model_dir): + return rnn.RNNEstimator( + head=head_lib.multi_class_head(n_classes=n_classes), + num_units=cell_units, + sequence_feature_columns=feature_columns, + model_dir=model_dir) + + +class RNNEstimatorIntegrationTest(BaseRNNClassificationIntegrationTest, + test.TestCase): + + def __init__(self, methodName='runTest'): # pylint: disable=invalid-name + test.TestCase.__init__(self, methodName) + BaseRNNClassificationIntegrationTest.__init__(self, _rnn_estimator_fn) + + if __name__ == '__main__': test.main() diff --git a/tensorflow/contrib/factorization/BUILD b/tensorflow/contrib/factorization/BUILD index 8baf0f2f1d07ca..4c0419b9a7623c 100644 --- a/tensorflow/contrib/factorization/BUILD +++ b/tensorflow/contrib/factorization/BUILD @@ -215,7 +215,7 @@ tf_py_test( "//tensorflow/python:platform_test", "//tensorflow/python:sparse_tensor", ], - tags = ["noasan"], # times out b/78588193 + shard_count = 4, ) # Estimators tests diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py index 811fa89bc38c61..8f73274c2a0ebb 100644 --- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py +++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py @@ -107,7 +107,7 @@ class WALSModel(object): # the prep_gramian_op for row(column) can be run. worker_init_op = model.worker_init - # To be run once per integration sweep before the row(column) update + # To be run once per iteration sweep before the row(column) update # initialize ops can be run. Note that in the distributed training # situations, this should only be run by the chief trainer. All other # trainers need to block until this is done. @@ -197,7 +197,8 @@ def __init__(self, row_weights=1, col_weights=1, use_factors_weights_cache=True, - use_gramian_cache=True): + use_gramian_cache=True, + use_scoped_vars=False): """Creates model for WALS matrix factorization. Args: @@ -239,6 +240,8 @@ def __init__(self, weights cache to take effect. use_gramian_cache: When True, the Gramians will be cached on the workers before the updates start. Defaults to True. + use_scoped_vars: When True, the factor and weight vars will also be nested + in a tf.name_scope. 
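The new `use_scoped_vars` flag described above only changes how the graph is organized: variable construction is wrapped in `tf.name_scope` blocks so the factor and weight nodes group together (for example in TensorBoard), with no change to the math. A minimal sketch of the pattern, with made-up variable names and shapes:

```python
import tensorflow as tf

use_scoped_vars = True  # mirrors the new constructor argument

def _create_factors():
  # Stand-ins for the sharded factor/weight variables the model creates.
  row_factors = tf.Variable(tf.zeros([100, 10]), name='row_factors')
  col_factors = tf.Variable(tf.zeros([50, 10]), name='col_factors')
  return row_factors, col_factors

if use_scoped_vars:
  # Nodes are created as "factors/row_factors", "factors/col_factors", ...
  with tf.name_scope('factors'):
    row_factors, col_factors = _create_factors()
else:
  row_factors, col_factors = _create_factors()
```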
""" self._input_rows = input_rows self._input_cols = input_cols @@ -251,25 +254,46 @@ def __init__(self, regularization * linalg_ops.eye(self._n_components) if regularization is not None else None) assert (row_weights is None) == (col_weights is None) - self._row_weights = WALSModel._create_weights( - row_weights, self._input_rows, self._num_row_shards, "row_weights") - self._col_weights = WALSModel._create_weights( - col_weights, self._input_cols, self._num_col_shards, "col_weights") self._use_factors_weights_cache = use_factors_weights_cache self._use_gramian_cache = use_gramian_cache - self._row_factors = self._create_factors( - self._input_rows, self._n_components, self._num_row_shards, row_init, - "row_factors") - self._col_factors = self._create_factors( - self._input_cols, self._n_components, self._num_col_shards, col_init, - "col_factors") + + if use_scoped_vars: + with ops.name_scope("row_weights"): + self._row_weights = WALSModel._create_weights( + row_weights, self._input_rows, self._num_row_shards, "row_weights") + with ops.name_scope("col_weights"): + self._col_weights = WALSModel._create_weights( + col_weights, self._input_cols, self._num_col_shards, "col_weights") + with ops.name_scope("row_factors"): + self._row_factors = self._create_factors( + self._input_rows, self._n_components, self._num_row_shards, + row_init, "row_factors") + with ops.name_scope("col_factors"): + self._col_factors = self._create_factors( + self._input_cols, self._n_components, self._num_col_shards, + col_init, "col_factors") + else: + self._row_weights = WALSModel._create_weights( + row_weights, self._input_rows, self._num_row_shards, "row_weights") + self._col_weights = WALSModel._create_weights( + col_weights, self._input_cols, self._num_col_shards, "col_weights") + self._row_factors = self._create_factors( + self._input_rows, self._n_components, self._num_row_shards, row_init, + "row_factors") + self._col_factors = self._create_factors( + self._input_cols, self._n_components, self._num_col_shards, col_init, + "col_factors") + self._row_gramian = self._create_gramian(self._n_components, "row_gramian") self._col_gramian = self._create_gramian(self._n_components, "col_gramian") - self._row_update_prep_gramian = self._prepare_gramian( - self._col_factors, self._col_gramian) - self._col_update_prep_gramian = self._prepare_gramian( - self._row_factors, self._row_gramian) - self._create_transient_vars() + with ops.name_scope("row_prepare_gramian"): + self._row_update_prep_gramian = self._prepare_gramian( + self._col_factors, self._col_gramian) + with ops.name_scope("col_prepare_gramian"): + self._col_update_prep_gramian = self._prepare_gramian( + self._row_factors, self._row_gramian) + with ops.name_scope("transient_vars"): + self._create_transient_vars() @property def row_factors(self): @@ -436,7 +460,7 @@ def _prepare_gramian(self, factors, gramian): gramian: Variable storing the gramian calculated from the factors. Returns: - A op that updates the gramian with the calculated value from the factors. + An op that updates the gramian with the calculated value from the factors. 
""" partial_gramians = [] for f in factors: diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py index 555beddeaab419..b588f75efe9d0b 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py @@ -346,7 +346,8 @@ def sequence_numeric_column( key, shape=(1,), default_value=0., - dtype=dtypes.float32): + dtype=dtypes.float32, + normalizer_fn=None): """Returns a feature column that represents sequences of numeric data. Example: @@ -370,6 +371,12 @@ def sequence_numeric_column( default_value: A single value compatible with `dtype` that is used for padding the sparse data into a dense `Tensor`. dtype: The type of values. + normalizer_fn: If not `None`, a function that can be used to normalize the + value of the tensor after `default_value` is applied for parsing. + Normalizer function takes the input `Tensor` as its argument, and returns + the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that + even though the most common use case of this function is normalization, it + can be used for any kind of Tensorflow transformations. Returns: A `_SequenceNumericColumn`. @@ -383,12 +390,16 @@ def sequence_numeric_column( if not (dtype.is_integer or dtype.is_floating): raise ValueError('dtype must be convertible to float. ' 'dtype: {}, key: {}'.format(dtype, key)) + if normalizer_fn is not None and not callable(normalizer_fn): + raise TypeError( + 'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn)) return _SequenceNumericColumn( key, shape=shape, default_value=default_value, - dtype=dtype) + dtype=dtype, + normalizer_fn=normalizer_fn) def _assert_all_equal_and_return(tensors, name=None): @@ -407,7 +418,7 @@ class _SequenceNumericColumn( fc._SequenceDenseColumn, collections.namedtuple( '_SequenceNumericColumn', - ['key', 'shape', 'default_value', 'dtype'])): + ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])): """Represents sequences of numeric data.""" @property @@ -419,7 +430,10 @@ def _parse_example_spec(self): return {self.key: parsing_ops.VarLenFeature(self.dtype)} def _transform_feature(self, inputs): - return inputs.get(self.key) + input_tensor = inputs.get(self.key) + if self.normalizer_fn is not None: + input_tensor = self.normalizer_fn(input_tensor) + return input_tensor @property def _variable_shape(self): diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py index 88f5d535162939..89b5f4c4137f6c 100644 --- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py +++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py @@ -28,6 +28,7 @@ from tensorflow.python.framework import errors from tensorflow.python.framework import ops from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import sparse_ops from tensorflow.python.platform import test from tensorflow.python.training import monitored_session @@ -670,6 +671,7 @@ def test_defaults(self): self.assertEqual((1,), a.shape) self.assertEqual(0., a.default_value) self.assertEqual(dtypes.float32, a.dtype) + self.assertIsNone(a.normalizer_fn) def test_shape_saved_as_tuple(self): a = sfc.sequence_numeric_column('aaa', shape=[1, 
2]) @@ -688,6 +690,10 @@ def test_dtype_is_convertible_to_float(self): ValueError, 'dtype must be convertible to float'): sfc.sequence_numeric_column('aaa', dtype=dtypes.string) + def test_normalizer_fn_must_be_callable(self): + with self.assertRaisesRegexp(TypeError, 'must be a callable'): + sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable') + def test_get_sequence_dense_tensor(self): sparse_input = sparse_tensor.SparseTensorValue( # example 0, values [[0.], [1]] @@ -708,6 +714,41 @@ def test_get_sequence_dense_tensor(self): self.assertAllEqual( expected_dense_tensor, dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_normalizer_fn(self): + + def _increment_two(input_sparse_tensor): + return sparse_ops.sparse_add( + input_sparse_tensor, + sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2)) + ) + + sparse_input = sparse_tensor.SparseTensorValue( + # example 0, values [[0.], [1]] + # example 1, [[10.]] + indices=((0, 0), (0, 1), (1, 0)), + values=(0., 1., 10.), + dense_shape=(2, 2)) + + # Before _increment_two: + # [[0.], [1.]], + # [[10.], [0.]], + # After _increment_two: + # [[2.], [1.]], + # [[10.], [2.]], + expected_dense_tensor = [ + [[2.], [1.]], + [[10.], [2.]], + ] + numeric_column = sfc.sequence_numeric_column( + 'aaa', normalizer_fn=_increment_two) + + dense_tensor, _ = numeric_column._get_sequence_dense_tensor( + _LazyBuilder({'aaa': sparse_input})) + + with monitored_session.MonitoredSession() as sess: + self.assertAllEqual( + expected_dense_tensor, dense_tensor.eval(session=sess)) + def test_get_sequence_dense_tensor_with_shape(self): """Tests get_sequence_dense_tensor with shape !=(1,).""" sparse_input = sparse_tensor.SparseTensorValue( diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py index daba965a98893b..484ffee3e7afe5 100644 --- a/tensorflow/contrib/ffmpeg/__init__.py +++ b/tensorflow/contrib/ffmpeg/__init__.py @@ -28,7 +28,6 @@ from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio -from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video from tensorflow.python.util.all_util import remove_undocumented diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h index a8d5a0dd83fb50..bf2aa75545813f 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_lib.h +++ b/tensorflow/contrib/ffmpeg/ffmpeg_lib.h @@ -53,7 +53,7 @@ Status CreateAudioFile(const string& audio_format_id, int32 bits_per_second, int32 samples_per_second, int32 channel_count, const std::vector& samples, string* output_data); -// Reads an video file using ffmpeg adn converts it into a RGB24 in uint8 +// Reads an video file using ffmpeg and converts it into a RGB24 in uint8 // [frames, height, width, 3]. The w, h, and frames are obtained from ffmpeg. 
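Outside of the test above, typical use of the new `normalizer_fn` argument looks like the following sketch. The feature key is made up, and the lambda mirrors the example given in the docstring; the function is applied to the parsed input tensor before the column produces its dense output.

```python
from tensorflow.contrib.feature_column.python.feature_column import (
    sequence_feature_column as sfc)

# default_value is applied during parsing first, then normalizer_fn runs on
# the parsed tensor before the column emits its dense representation.
price_history = sfc.sequence_numeric_column(
    'price_history',
    default_value=0.,
    normalizer_fn=lambda x: (x - 3.0) / 4.2)
```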
Status ReadVideoFile(const string& filename, std::vector* output_data, uint32* width, uint32* height, uint32* frames); diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py index 020b5c99c61019..b1b5126d9e9e51 100644 --- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py +++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py @@ -21,7 +21,6 @@ from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py -from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py from tensorflow.contrib.util import loader from tensorflow.python.framework import ops from tensorflow.python.platform import resource_loader diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py index 10d1ecc738de67..dc49383c5c300e 100644 --- a/tensorflow/contrib/framework/__init__.py +++ b/tensorflow/contrib/framework/__init__.py @@ -119,14 +119,13 @@ from tensorflow.python.framework.smart_cond import smart_constant_value from tensorflow.python.framework.tensor_spec import BoundedTensorSpec from tensorflow.python.framework.tensor_spec import TensorSpec -from tensorflow.python.ops.array_ops import broadcast_to from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d from tensorflow.python.util.all_util import remove_undocumented -_allowed_symbols = ['nest', 'broadcast_to'] +_allowed_symbols = ['nest'] _nest_allowed_symbols = [ 'assert_same_structure', 'is_sequence', diff --git a/tensorflow/contrib/framework/python/framework/tensor_util_test.py b/tensorflow/contrib/framework/python/framework/tensor_util_test.py index 8fc4f60492b0bf..af1b404cb51bf5 100644 --- a/tensorflow/contrib/framework/python/framework/tensor_util_test.py +++ b/tensorflow/contrib/framework/python/framework/tensor_util_test.py @@ -78,7 +78,6 @@ def test_assert_scalar_int(self): [3, 4], dtype=dtypes.int32)) -@test_util.with_c_api class WithShapeTest(test.TestCase): def _assert_with_shape(self, tensor, expected_value, expected_shape, @@ -216,25 +215,18 @@ def test_with_shape_partial(self): tensor_partial_shape.set_shape([None, 2]) for incompatible_shape in [[0], [1]]: - if ops._USE_C_API: - error_message = "Shapes must be equal rank, but are 2 and 1" - else: - error_message = r"Shapes \(\?, 2\) and \([01],\) are not compatible" self.assertRaisesRegexp( - ValueError, error_message, + ValueError, "Shapes must be equal rank, but are 2 and 1", tensor_util.with_shape, incompatible_shape, tensor_partial_shape) for incompatible_shape in [[1, 2, 1]]: self.assertRaisesRegexp(ValueError, "Dimensions must be equal", tensor_util.with_shape, incompatible_shape, tensor_partial_shape) for incompatible_shape in [[2, 1]]: - if ops._USE_C_API: - error_message = (r"Dimension 1 in both shapes must be equal, but are " - r"2 and 1. Shapes are \[\?,2\] and \[2,1\].") - else: - error_message = r"Shapes \(\?, 2\) and \(2, 1\) are not compatible" self.assertRaisesRegexp( - ValueError, error_message, + ValueError, + r"Dimension 1 in both shapes must be equal, but are 2 and 1. 
" + r"Shapes are \[\?,2\] and \[2,1\].", tensor_util.with_shape, incompatible_shape, tensor_partial_shape) compatible_shape = [2, 2] diff --git a/tensorflow/contrib/framework/python/ops/critical_section_ops.py b/tensorflow/contrib/framework/python/ops/critical_section_ops.py index bd764ed57a6da0..72835c3ad86e63 100644 --- a/tensorflow/contrib/framework/python/ops/critical_section_ops.py +++ b/tensorflow/contrib/framework/python/ops/critical_section_ops.py @@ -202,7 +202,7 @@ def execute(self, fn, *args, **kwargs): or lazy way that may cause a deadlock. ValueError: If `exclusive_resource_access` is not provided (is `True`) and another `CriticalSection` has an execution requesting the same - resources as in `*args`, `**kwargs`, and any additionaly captured + resources as in `*args`, `**kwargs`, and any additionally captured inputs in `fn`. Note, even if `exclusive_resource_access` is `True`, if another execution in another `CriticalSection` was created without `exclusive_resource_access=True`, a `ValueError` will be raised. diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD index e1ac5d77139786..6f1e4d3626140a 100644 --- a/tensorflow/contrib/fused_conv/BUILD +++ b/tensorflow/contrib/fused_conv/BUILD @@ -75,6 +75,7 @@ tf_kernel_library( "//tensorflow/core/kernels:gpu_util_hdrs", "//tensorflow/core/kernels:ops_util_hdrs", "//third_party/eigen3", + "@local_config_cuda//cuda:cudnn_header", ], alwayslink = 1, ) @@ -94,6 +95,7 @@ tf_custom_op_library( "//tensorflow/core/kernels:conv_ops_gpu_hdrs", "//tensorflow/core/kernels:gpu_util_hdrs", "//tensorflow/core/kernels:ops_util_hdrs", + "@local_config_cuda//cuda:cudnn_header", ], ) diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py index 3d0ed899322c26..4d62ac65ff619f 100644 --- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py +++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py @@ -289,8 +289,8 @@ def _VerifyValues(self, tensor_in_sizes, filter_in_sizes, bias, strides, conv = tensors[i] value = values[i] ref_value = ref_values[i] - print("expected = ", ref_value) - print("actual = ", value) + tf_logging.info("expected = ", ref_value) + tf_logging.info("actual = ", value) tol = 1e-5 if value.dtype == np.float16: tol = 1e-3 @@ -831,7 +831,8 @@ def runTest(self, test_param): vertical_stride, padding_type) output_width = CalculateConvolvedOutputDim(input_width, filter_width, horizontal_stride, padding_type) - print("output_height=", output_height, ", output_width=", output_width) + tf_logging.info("output_height=", output_height, ", output_width=", + output_width) side_input, _, _ = gen_array_ops.quantize_v2( random_ops.random_uniform( @@ -866,8 +867,8 @@ def runTest(self, test_param): with self.test_session(use_gpu=True) as sess: actual_y, expected_y = sess.run([actual, expected]) - print("actual_y = ", actual_y) - print("expected_y = ", expected_y) + tf_logging.info("actual_y = ", actual_y) + tf_logging.info("expected_y = ", expected_y) self.assertTrue(np.array_equal(actual_y, expected_y)) def testFusedConvInt8(self): diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py index e3fc6bf0f03405..4092b320042162 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py +++ 
b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py @@ -112,6 +112,7 @@ def __init__(self, generator_optimizer=None, discriminator_optimizer=None, get_hooks_fn=None, + get_eval_metric_ops_fn=None, add_summaries=None, use_loss_summaries=True, config=None): @@ -146,6 +147,9 @@ def __init__(self, list of hooks. These hooks are run on the generator and discriminator train ops, and can be used to implement the GAN training scheme. Defaults to `train.get_sequential_train_hooks()`. + get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a + dict of metric results keyed by name. The output of this function is + passed into `tf.estimator.EstimatorSpec` during evaluation. add_summaries: `None`, a single `SummaryType`, or a list of `SummaryType`. use_loss_summaries: If `True`, add loss summaries. If `False`, does not. If `None`, uses defaults. @@ -160,7 +164,8 @@ def _model_fn(features, labels, mode): else discriminator_optimizer) gan_head = head_lib.gan_head( generator_loss_fn, discriminator_loss_fn, gopt, dopt, - use_loss_summaries, get_hooks_fn=get_hooks_fn) + use_loss_summaries, get_hooks_fn=get_hooks_fn, + get_eval_metric_ops_fn=get_eval_metric_ops_fn) return _gan_model_fn( features, labels, mode, generator_fn, discriminator_fn, gan_head, add_summaries) diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py index 387a62bd741bd4..955482599b372b 100644 --- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py @@ -38,6 +38,7 @@ from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import metrics as metrics_lib from tensorflow.python.ops import parsing_ops from tensorflow.python.platform import test from tensorflow.python.summary.writer import writer_cache @@ -194,6 +195,12 @@ def make_opt(): lr = learning_rate_decay.exponential_decay(1.0, gstep, 10, 0.9) return training.GradientDescentOptimizer(lr) + def get_metrics(gan_model): + return { + 'mse_custom_metric': metrics_lib.mean_squared_error( + gan_model.real_data, gan_model.generated_data) + } + gopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) dopt = make_opt if lr_decay else training.GradientDescentOptimizer(1.0) est = estimator.GANEstimator( @@ -203,6 +210,7 @@ def make_opt(): discriminator_loss_fn=losses.wasserstein_discriminator_loss, generator_optimizer=gopt, discriminator_optimizer=dopt, + get_eval_metric_ops_fn=get_metrics, model_dir=self._model_dir) # TRAIN @@ -213,6 +221,9 @@ def make_opt(): scores = est.evaluate(eval_input_fn) self.assertEqual(num_steps, scores[ops.GraphKeys.GLOBAL_STEP]) self.assertIn('loss', six.iterkeys(scores)) + self.assertEqual(scores['discriminator_loss'] + scores['generator_loss'], + scores['loss']) + self.assertIn('mse_custom_metric', six.iterkeys(scores)) # PREDICT predictions = np.array([x for x in est.predict(predict_input_fn)]) diff --git a/tensorflow/contrib/gan/python/estimator/python/head_impl.py b/tensorflow/contrib/gan/python/estimator/python/head_impl.py index a21358c50bbdb4..ff903a78cc36c1 100644 --- a/tensorflow/contrib/gan/python/estimator/python/head_impl.py +++ b/tensorflow/contrib/gan/python/estimator/python/head_impl.py @@ -25,17 +25,21 @@ from tensorflow.python.estimator import model_fn as model_fn_lib from 
tensorflow.python.estimator.canned import head from tensorflow.python.framework import ops +from tensorflow.python.ops import metrics as metrics_lib __all__ = [ 'GANHead', 'gan_head', ] +def _summary_key(head_name, val): + return '%s/%s' % (val, head_name) if head_name else val + def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer, discriminator_optimizer, use_loss_summaries=True, get_hooks_fn=tfgan_train.get_sequential_train_hooks(), - name=None): + get_eval_metric_ops_fn=None, name=None): """Creates a `GANHead`. Args: @@ -47,9 +51,12 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer, discriminator_optimizer: Same as `generator_optimizer`, but for the discriminator updates. use_loss_summaries: If `True`, add loss summaries. If `False`, does not. - If `None`, uses defaults. - get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list - of hooks. + If `None`, uses defaults. + get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a + list of hooks. + get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a + dict of metric results keyed by name. The output of this function is + passed into `tf.estimator.EstimatorSpec` during evaluation. name: name of the head. If provided, summary and metrics keys will be suffixed by `"/" + name`. @@ -62,6 +69,7 @@ def gan_head(generator_loss_fn, discriminator_loss_fn, generator_optimizer, discriminator_optimizer=discriminator_optimizer, use_loss_summaries=use_loss_summaries, get_hooks_fn=get_hooks_fn, + get_eval_metric_ops_fn=get_eval_metric_ops_fn, name=name) @@ -72,6 +80,7 @@ def __init__(self, generator_loss_fn, discriminator_loss_fn, generator_optimizer, discriminator_optimizer, use_loss_summaries=True, get_hooks_fn=None, + get_eval_metric_ops_fn=None, name=None): """`Head` for GAN training. @@ -85,8 +94,11 @@ def __init__(self, generator_loss_fn, discriminator_loss_fn, discriminator updates. use_loss_summaries: If `True`, add loss summaries. If `False`, does not. If `None`, uses defaults. - get_hooks_fn: A function that takes a GANTrainOps tuple and returns a list - of hooks. Defaults to `train.get_sequential_train_hooks()` + get_hooks_fn: A function that takes a `GANTrainOps` tuple and returns a + list of hooks. Defaults to `train.get_sequential_train_hooks()` + get_eval_metric_ops_fn: A function that takes a `GANModel`, and returns a + dict of metric results keyed by name. The output of this function is + passed into `tf.estimator.EstimatorSpec` during evaluation. name: name of the head. If provided, summary and metrics keys will be suffixed by `"/" + name`. 
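A short sketch of the `get_eval_metric_ops_fn` hook documented above, modeled on the metric used in the GANEstimator test earlier in this patch; the metric name is arbitrary.

```python
from tensorflow.python.ops import metrics as metrics_lib

def get_eval_metrics(gan_model):
  # Called with the GANModel at eval time; must return a dict mapping metric
  # names to (value, update_op) pairs, which are merged with the default
  # generator/discriminator loss metrics.
  return {
      'mse_custom_metric':
          metrics_lib.mean_squared_error(gan_model.real_data,
                                         gan_model.generated_data),
  }

# The function is then passed as
#   gan_head(..., get_eval_metric_ops_fn=get_eval_metrics)
# or GANEstimator(..., get_eval_metric_ops_fn=get_eval_metrics),
# and the resulting keys appear in the dict returned by Estimator.evaluate().
```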
""" @@ -104,6 +116,8 @@ def __init__(self, generator_loss_fn, discriminator_loss_fn, self._generator_optimizer = generator_optimizer self._discriminator_optimizer = discriminator_optimizer self._get_hooks_fn = get_hooks_fn + self._get_eval_metric_ops_fn = get_eval_metric_ops_fn + self._name = name @property def name(self): @@ -173,13 +187,26 @@ def create_estimator_spec( gan_loss = self.create_loss( features=None, mode=mode, logits=gan_model, labels=None) scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss + with ops.name_scope(None, 'metrics', + [gan_loss.generator_loss, + gan_loss.discriminator_loss]): + eval_metric_ops = { + _summary_key(self._name, 'generator_loss'): + metrics_lib.mean(gan_loss.generator_loss), + _summary_key(self._name, 'discriminator_loss'): + metrics_lib.mean(gan_loss.discriminator_loss) + } + if self._get_eval_metric_ops_fn is not None: + custom_eval_metric_ops = self._get_eval_metric_ops_fn(gan_model) + if not isinstance(custom_eval_metric_ops, dict): + raise TypeError('get_eval_metric_ops_fn must return a dict, ' + 'received: {}'.format(custom_eval_metric_ops)) + eval_metric_ops.update(custom_eval_metric_ops) return model_fn_lib.EstimatorSpec( mode=model_fn_lib.ModeKeys.EVAL, predictions=gan_model.generated_data, loss=scalar_loss, - # TODO(joelshor): Add metrics. If head name provided, append it to - # metric keys. - eval_metric_ops={}) + eval_metric_ops=eval_metric_ops) elif mode == model_fn_lib.ModeKeys.TRAIN: if train_op_fn is None: raise ValueError('train_op_fn can not be None.') diff --git a/tensorflow/contrib/gan/python/estimator/python/head_test.py b/tensorflow/contrib/gan/python/estimator/python/head_test.py index 8168f005cd1105..6587f1fc600b94 100644 --- a/tensorflow/contrib/gan/python/estimator/python/head_test.py +++ b/tensorflow/contrib/gan/python/estimator/python/head_test.py @@ -62,9 +62,14 @@ def setUp(self): generator_loss_fn=dummy_loss, discriminator_loss_fn=dummy_loss, generator_optimizer=training.GradientDescentOptimizer(1.0), - discriminator_optimizer=training.GradientDescentOptimizer(1.0)) + discriminator_optimizer=training.GradientDescentOptimizer(1.0), + get_eval_metric_ops_fn=self.get_metrics) self.assertTrue(isinstance(self.gan_head, head.GANHead)) + def get_metrics(self, gan_model): + self.assertTrue(isinstance(gan_model, tfgan_tuples.GANModel)) + return {} + def _test_modes_helper(self, mode): self.gan_head.create_estimator_spec( features=None, diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils.py index df71187fbd98c8..a9b8faa7126253 100644 --- a/tensorflow/contrib/gan/python/features/python/conditioning_utils.py +++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# ============================================================================== -"""Miscellanous utilities for TFGAN code and examples.""" +"""Miscellaneous utilities for TFGAN code and examples.""" from __future__ import absolute_import from __future__ import division diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py index 2889e937436d2f..9f5fee45422e0b 100644 --- a/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py +++ b/tensorflow/contrib/gan/python/losses/python/losses_impl_test.py @@ -570,7 +570,7 @@ def setUp(self): 'predicted_distributions': self._predicted_distributions, } self._expected_loss = 1.61610 - self._expected_op_name = 'mutual_information_loss/mul' + self._expected_op_name = 'mutual_information_loss/mul_1' self._batch_size = 2 diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py index a320a3f232fc1d..026a3d12000334 100644 --- a/tensorflow/contrib/graph_editor/transform.py +++ b/tensorflow/contrib/graph_editor/transform.py @@ -189,9 +189,6 @@ def copy_op_handler(info, op, new_inputs, copy_shape=True, nodedef_fn=None): if op._original_op: op_._original_op = op._original_op - # Add op to the graph - info.graph_._add_op(op_) - return op_, op_.outputs @@ -492,7 +489,7 @@ def _finalize_cycles(self, info): t_ = info.transformed_ts[t] consumer_op_ = info.transformed_ops[consumer_op] t_index_ = list(consumer_op_.inputs).index(tmp_t_) - consumer_op_._update_input(t_index_, t_, update_dtype=False) # pylint: disable=protected-access + consumer_op_._update_input(t_index_, t_) # pylint: disable=protected-access def _connect_control_inputs(self, info): """Connect the previously copied ops.""" @@ -677,7 +674,7 @@ def replace_t_with_replacement_handler(info, t): def _add_control_flow_ops(ops, control_ios): - """Complete `ops` so that the tranformed graph is valid. + """Complete `ops` so that the transformed graph is valid. Partially copying a graph can lead to a malformed graph. For instance, copying half of a while construct is likely to result in an invalid graph. diff --git a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c index 6a5d982dc8514d..2e5c84704f8464 100644 --- a/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c +++ b/tensorflow/contrib/hvx/hexagon_controller/src_impl/hexagon_controller.c @@ -19,7 +19,7 @@ limitations under the License. 
#include "hexagon_controller.h" -#include +#include #include #include "adspmsgd.h" diff --git a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc index 60281951dda940..66939fbb0f0d3b 100644 --- a/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc +++ b/tensorflow/contrib/hvx/hvx_ops_support_checker/hvx_ops_support_checker_main.cc @@ -115,7 +115,7 @@ static void CheckOpsSupport(const GraphDef& graph_def, HexagonOpsDefinitions::getInstance(); LOG(INFO) << "Checking " << graph_def.node_size() << " nodes"; LOG(INFO) << "dump_all_nodes = " << dump_all_nodes - << ", dump_shape_and_tpye = " << dump_shape_and_type; + << ", dump_shape_and_type = " << dump_shape_and_type; std::unordered_set unsupported_ops; bool all_supported = true; diff --git a/tensorflow/contrib/image/__init__.py b/tensorflow/contrib/image/__init__.py index 8f406ace1d5dcc..f230d93da4a9c0 100755 --- a/tensorflow/contrib/image/__init__.py +++ b/tensorflow/contrib/image/__init__.py @@ -17,7 +17,7 @@ ### API This module provides functions for image manipulation; currently, chrominance -transformas (including changing saturation and hue) in YIQ space and +transforms (including changing saturation and hue) in YIQ space and projective transforms (including rotation) are supported. ## Image Transformation `Ops` diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc index a4cd4a2cc4b99b..2638b25ec424b5 100644 --- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc +++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc @@ -64,7 +64,7 @@ class KafkaDatasetOp : public DatasetOpKernel { eof_(eof), timeout_(timeout) {} - std::unique_ptr MakeIterator( + std::unique_ptr MakeIteratorInternal( const string& prefix) const override { return std::unique_ptr( new Iterator({this, strings::StrCat(prefix, "::Kafka")})); @@ -81,7 +81,7 @@ class KafkaDatasetOp : public DatasetOpKernel { return *shapes; } - string DebugString() override { return "KafkaDatasetOp::Dataset"; } + string DebugString() const override { return "KafkaDatasetOp::Dataset"; } protected: Status AsGraphDefInternal(DatasetGraphDefBuilder* b, diff --git a/tensorflow/contrib/keras/api/keras/activations/__init__.py b/tensorflow/contrib/keras/api/keras/activations/__init__.py index d04838c218d664..3f0184276f6b90 100644 --- a/tensorflow/contrib/keras/api/keras/activations/__init__.py +++ b/tensorflow/contrib/keras/api/keras/activations/__init__.py @@ -19,22 +19,22 @@ from __future__ import print_function # Activation functions. 
-from tensorflow.python.keras._impl.keras.activations import elu -from tensorflow.python.keras._impl.keras.activations import hard_sigmoid -from tensorflow.python.keras._impl.keras.activations import linear -from tensorflow.python.keras._impl.keras.activations import relu -from tensorflow.python.keras._impl.keras.activations import selu -from tensorflow.python.keras._impl.keras.activations import sigmoid -from tensorflow.python.keras._impl.keras.activations import softmax -from tensorflow.python.keras._impl.keras.activations import softplus -from tensorflow.python.keras._impl.keras.activations import softsign -from tensorflow.python.keras._impl.keras.activations import tanh +from tensorflow.python.keras.activations import elu +from tensorflow.python.keras.activations import hard_sigmoid +from tensorflow.python.keras.activations import linear +from tensorflow.python.keras.activations import relu +from tensorflow.python.keras.activations import selu +from tensorflow.python.keras.activations import sigmoid +from tensorflow.python.keras.activations import softmax +from tensorflow.python.keras.activations import softplus +from tensorflow.python.keras.activations import softsign +from tensorflow.python.keras.activations import tanh # Auxiliary utils. # pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.activations import deserialize -from tensorflow.python.keras._impl.keras.activations import serialize -from tensorflow.python.keras._impl.keras.activations import get +from tensorflow.python.keras.activations import deserialize +from tensorflow.python.keras.activations import serialize +from tensorflow.python.keras.activations import get del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py b/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py index abf8393ae45d71..6dfb5cab17c088 100644 --- a/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py +++ b/tensorflow/contrib/keras/api/keras/applications/inception_v3/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.applications.inception_v3 import decode_predictions -from tensorflow.python.keras._impl.keras.applications.inception_v3 import InceptionV3 -from tensorflow.python.keras._impl.keras.applications.inception_v3 import preprocess_input +from tensorflow.python.keras.applications.inception_v3 import decode_predictions +from tensorflow.python.keras.applications.inception_v3 import InceptionV3 +from tensorflow.python.keras.applications.inception_v3 import preprocess_input del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py b/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py index b809e91193b459..67306cc51e1927 100644 --- a/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py +++ b/tensorflow/contrib/keras/api/keras/applications/mobilenet/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.applications.mobilenet import decode_predictions -from tensorflow.python.keras._impl.keras.applications.mobilenet import MobileNet -from tensorflow.python.keras._impl.keras.applications.mobilenet import preprocess_input +from tensorflow.python.keras.applications.mobilenet import decode_predictions +from tensorflow.python.keras.applications.mobilenet import 
MobileNet +from tensorflow.python.keras.applications.mobilenet import preprocess_input del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py b/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py index 530805d150bfe3..a25ff48b593a9a 100644 --- a/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py +++ b/tensorflow/contrib/keras/api/keras/applications/resnet50/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.applications.resnet50 import decode_predictions -from tensorflow.python.keras._impl.keras.applications.resnet50 import preprocess_input -from tensorflow.python.keras._impl.keras.applications.resnet50 import ResNet50 +from tensorflow.python.keras.applications.resnet50 import decode_predictions +from tensorflow.python.keras.applications.resnet50 import preprocess_input +from tensorflow.python.keras.applications.resnet50 import ResNet50 del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py b/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py index 118361604bbc7e..4964b1b7deb56f 100644 --- a/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py +++ b/tensorflow/contrib/keras/api/keras/applications/vgg16/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.applications.vgg16 import decode_predictions -from tensorflow.python.keras._impl.keras.applications.vgg16 import preprocess_input -from tensorflow.python.keras._impl.keras.applications.vgg16 import VGG16 +from tensorflow.python.keras.applications.vgg16 import decode_predictions +from tensorflow.python.keras.applications.vgg16 import preprocess_input +from tensorflow.python.keras.applications.vgg16 import VGG16 del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py b/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py index cda52628f3c10d..afb3abebdd6735 100644 --- a/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py +++ b/tensorflow/contrib/keras/api/keras/applications/vgg19/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.applications.vgg19 import decode_predictions -from tensorflow.python.keras._impl.keras.applications.vgg19 import preprocess_input -from tensorflow.python.keras._impl.keras.applications.vgg19 import VGG19 +from tensorflow.python.keras.applications.vgg19 import decode_predictions +from tensorflow.python.keras.applications.vgg19 import preprocess_input +from tensorflow.python.keras.applications.vgg19 import VGG19 del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py b/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py index ae9cd9cd18c5cc..2e3335d02aff0f 100644 --- a/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py +++ b/tensorflow/contrib/keras/api/keras/applications/xception/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.applications.xception import decode_predictions -from tensorflow.python.keras._impl.keras.applications.xception import preprocess_input -from 
tensorflow.python.keras._impl.keras.applications.xception import Xception +from tensorflow.python.keras.applications.xception import decode_predictions +from tensorflow.python.keras.applications.xception import preprocess_input +from tensorflow.python.keras.applications.xception import Xception del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/backend/__init__.py b/tensorflow/contrib/keras/api/keras/backend/__init__.py index 10ef5a75852deb..a755364014206e 100644 --- a/tensorflow/contrib/keras/api/keras/backend/__init__.py +++ b/tensorflow/contrib/keras/api/keras/backend/__init__.py @@ -19,144 +19,144 @@ from __future__ import print_function # pylint: disable=redefined-builtin -from tensorflow.python.keras._impl.keras.backend import abs -from tensorflow.python.keras._impl.keras.backend import all -from tensorflow.python.keras._impl.keras.backend import any -from tensorflow.python.keras._impl.keras.backend import arange -from tensorflow.python.keras._impl.keras.backend import argmax -from tensorflow.python.keras._impl.keras.backend import argmin -from tensorflow.python.keras._impl.keras.backend import backend -from tensorflow.python.keras._impl.keras.backend import batch_dot -from tensorflow.python.keras._impl.keras.backend import batch_flatten -from tensorflow.python.keras._impl.keras.backend import batch_get_value -from tensorflow.python.keras._impl.keras.backend import batch_normalization -from tensorflow.python.keras._impl.keras.backend import batch_set_value -from tensorflow.python.keras._impl.keras.backend import bias_add -from tensorflow.python.keras._impl.keras.backend import binary_crossentropy -from tensorflow.python.keras._impl.keras.backend import cast -from tensorflow.python.keras._impl.keras.backend import cast_to_floatx -from tensorflow.python.keras._impl.keras.backend import categorical_crossentropy -from tensorflow.python.keras._impl.keras.backend import clear_session -from tensorflow.python.keras._impl.keras.backend import clip -from tensorflow.python.keras._impl.keras.backend import concatenate -from tensorflow.python.keras._impl.keras.backend import constant -from tensorflow.python.keras._impl.keras.backend import conv1d -from tensorflow.python.keras._impl.keras.backend import conv2d -from tensorflow.python.keras._impl.keras.backend import conv2d_transpose -from tensorflow.python.keras._impl.keras.backend import conv3d -from tensorflow.python.keras._impl.keras.backend import cos -from tensorflow.python.keras._impl.keras.backend import count_params -from tensorflow.python.keras._impl.keras.backend import ctc_batch_cost -from tensorflow.python.keras._impl.keras.backend import ctc_decode -from tensorflow.python.keras._impl.keras.backend import ctc_label_dense_to_sparse -from tensorflow.python.keras._impl.keras.backend import dot -from tensorflow.python.keras._impl.keras.backend import dropout -from tensorflow.python.keras._impl.keras.backend import dtype -from tensorflow.python.keras._impl.keras.backend import elu -from tensorflow.python.keras._impl.keras.backend import epsilon -from tensorflow.python.keras._impl.keras.backend import equal -from tensorflow.python.keras._impl.keras.backend import eval -from tensorflow.python.keras._impl.keras.backend import exp -from tensorflow.python.keras._impl.keras.backend import expand_dims -from tensorflow.python.keras._impl.keras.backend import eye -from tensorflow.python.keras._impl.keras.backend import flatten -from tensorflow.python.keras._impl.keras.backend import floatx -from 
tensorflow.python.keras._impl.keras.backend import foldl -from tensorflow.python.keras._impl.keras.backend import foldr -from tensorflow.python.keras._impl.keras.backend import function -from tensorflow.python.keras._impl.keras.backend import gather -from tensorflow.python.keras._impl.keras.backend import get_session -from tensorflow.python.keras._impl.keras.backend import get_uid -from tensorflow.python.keras._impl.keras.backend import get_value -from tensorflow.python.keras._impl.keras.backend import gradients -from tensorflow.python.keras._impl.keras.backend import greater -from tensorflow.python.keras._impl.keras.backend import greater_equal -from tensorflow.python.keras._impl.keras.backend import hard_sigmoid -from tensorflow.python.keras._impl.keras.backend import image_data_format -from tensorflow.python.keras._impl.keras.backend import in_test_phase -from tensorflow.python.keras._impl.keras.backend import in_top_k -from tensorflow.python.keras._impl.keras.backend import in_train_phase -from tensorflow.python.keras._impl.keras.backend import int_shape -from tensorflow.python.keras._impl.keras.backend import is_sparse -from tensorflow.python.keras._impl.keras.backend import l2_normalize -from tensorflow.python.keras._impl.keras.backend import learning_phase -from tensorflow.python.keras._impl.keras.backend import less -from tensorflow.python.keras._impl.keras.backend import less_equal -from tensorflow.python.keras._impl.keras.backend import log -from tensorflow.python.keras._impl.keras.backend import manual_variable_initialization -from tensorflow.python.keras._impl.keras.backend import map_fn -from tensorflow.python.keras._impl.keras.backend import max -from tensorflow.python.keras._impl.keras.backend import maximum -from tensorflow.python.keras._impl.keras.backend import mean -from tensorflow.python.keras._impl.keras.backend import min -from tensorflow.python.keras._impl.keras.backend import minimum -from tensorflow.python.keras._impl.keras.backend import moving_average_update -from tensorflow.python.keras._impl.keras.backend import name_scope -from tensorflow.python.keras._impl.keras.backend import ndim -from tensorflow.python.keras._impl.keras.backend import normalize_batch_in_training -from tensorflow.python.keras._impl.keras.backend import not_equal -from tensorflow.python.keras._impl.keras.backend import one_hot -from tensorflow.python.keras._impl.keras.backend import ones -from tensorflow.python.keras._impl.keras.backend import ones_like -from tensorflow.python.keras._impl.keras.backend import permute_dimensions -from tensorflow.python.keras._impl.keras.backend import placeholder -from tensorflow.python.keras._impl.keras.backend import pool2d -from tensorflow.python.keras._impl.keras.backend import pool3d -from tensorflow.python.keras._impl.keras.backend import pow -from tensorflow.python.keras._impl.keras.backend import print_tensor -from tensorflow.python.keras._impl.keras.backend import prod -from tensorflow.python.keras._impl.keras.backend import random_binomial -from tensorflow.python.keras._impl.keras.backend import random_normal -from tensorflow.python.keras._impl.keras.backend import random_normal_variable -from tensorflow.python.keras._impl.keras.backend import random_uniform -from tensorflow.python.keras._impl.keras.backend import random_uniform_variable -from tensorflow.python.keras._impl.keras.backend import relu -from tensorflow.python.keras._impl.keras.backend import repeat -from tensorflow.python.keras._impl.keras.backend import repeat_elements -from 
tensorflow.python.keras._impl.keras.backend import reset_uids -from tensorflow.python.keras._impl.keras.backend import reshape -from tensorflow.python.keras._impl.keras.backend import resize_images -from tensorflow.python.keras._impl.keras.backend import resize_volumes -from tensorflow.python.keras._impl.keras.backend import reverse -from tensorflow.python.keras._impl.keras.backend import rnn -from tensorflow.python.keras._impl.keras.backend import round -from tensorflow.python.keras._impl.keras.backend import separable_conv2d -from tensorflow.python.keras._impl.keras.backend import set_epsilon -from tensorflow.python.keras._impl.keras.backend import set_floatx -from tensorflow.python.keras._impl.keras.backend import set_image_data_format -from tensorflow.python.keras._impl.keras.backend import set_learning_phase -from tensorflow.python.keras._impl.keras.backend import set_session -from tensorflow.python.keras._impl.keras.backend import set_value -from tensorflow.python.keras._impl.keras.backend import shape -from tensorflow.python.keras._impl.keras.backend import sigmoid -from tensorflow.python.keras._impl.keras.backend import sign -from tensorflow.python.keras._impl.keras.backend import sin -from tensorflow.python.keras._impl.keras.backend import softmax -from tensorflow.python.keras._impl.keras.backend import softplus -from tensorflow.python.keras._impl.keras.backend import softsign -from tensorflow.python.keras._impl.keras.backend import sparse_categorical_crossentropy -from tensorflow.python.keras._impl.keras.backend import spatial_2d_padding -from tensorflow.python.keras._impl.keras.backend import spatial_3d_padding -from tensorflow.python.keras._impl.keras.backend import sqrt -from tensorflow.python.keras._impl.keras.backend import square -from tensorflow.python.keras._impl.keras.backend import squeeze -from tensorflow.python.keras._impl.keras.backend import stack -from tensorflow.python.keras._impl.keras.backend import std -from tensorflow.python.keras._impl.keras.backend import stop_gradient -from tensorflow.python.keras._impl.keras.backend import sum -from tensorflow.python.keras._impl.keras.backend import switch -from tensorflow.python.keras._impl.keras.backend import tanh -from tensorflow.python.keras._impl.keras.backend import temporal_padding -from tensorflow.python.keras._impl.keras.backend import to_dense -from tensorflow.python.keras._impl.keras.backend import transpose -from tensorflow.python.keras._impl.keras.backend import truncated_normal -from tensorflow.python.keras._impl.keras.backend import update -from tensorflow.python.keras._impl.keras.backend import update_add -from tensorflow.python.keras._impl.keras.backend import update_sub -from tensorflow.python.keras._impl.keras.backend import var -from tensorflow.python.keras._impl.keras.backend import variable -from tensorflow.python.keras._impl.keras.backend import zeros -from tensorflow.python.keras._impl.keras.backend import zeros_like +from tensorflow.python.keras.backend import abs +from tensorflow.python.keras.backend import all +from tensorflow.python.keras.backend import any +from tensorflow.python.keras.backend import arange +from tensorflow.python.keras.backend import argmax +from tensorflow.python.keras.backend import argmin +from tensorflow.python.keras.backend import backend +from tensorflow.python.keras.backend import batch_dot +from tensorflow.python.keras.backend import batch_flatten +from tensorflow.python.keras.backend import batch_get_value +from tensorflow.python.keras.backend import 
batch_normalization +from tensorflow.python.keras.backend import batch_set_value +from tensorflow.python.keras.backend import bias_add +from tensorflow.python.keras.backend import binary_crossentropy +from tensorflow.python.keras.backend import cast +from tensorflow.python.keras.backend import cast_to_floatx +from tensorflow.python.keras.backend import categorical_crossentropy +from tensorflow.python.keras.backend import clear_session +from tensorflow.python.keras.backend import clip +from tensorflow.python.keras.backend import concatenate +from tensorflow.python.keras.backend import constant +from tensorflow.python.keras.backend import conv1d +from tensorflow.python.keras.backend import conv2d +from tensorflow.python.keras.backend import conv2d_transpose +from tensorflow.python.keras.backend import conv3d +from tensorflow.python.keras.backend import cos +from tensorflow.python.keras.backend import count_params +from tensorflow.python.keras.backend import ctc_batch_cost +from tensorflow.python.keras.backend import ctc_decode +from tensorflow.python.keras.backend import ctc_label_dense_to_sparse +from tensorflow.python.keras.backend import dot +from tensorflow.python.keras.backend import dropout +from tensorflow.python.keras.backend import dtype +from tensorflow.python.keras.backend import elu +from tensorflow.python.keras.backend import epsilon +from tensorflow.python.keras.backend import equal +from tensorflow.python.keras.backend import eval +from tensorflow.python.keras.backend import exp +from tensorflow.python.keras.backend import expand_dims +from tensorflow.python.keras.backend import eye +from tensorflow.python.keras.backend import flatten +from tensorflow.python.keras.backend import floatx +from tensorflow.python.keras.backend import foldl +from tensorflow.python.keras.backend import foldr +from tensorflow.python.keras.backend import function +from tensorflow.python.keras.backend import gather +from tensorflow.python.keras.backend import get_session +from tensorflow.python.keras.backend import get_uid +from tensorflow.python.keras.backend import get_value +from tensorflow.python.keras.backend import gradients +from tensorflow.python.keras.backend import greater +from tensorflow.python.keras.backend import greater_equal +from tensorflow.python.keras.backend import hard_sigmoid +from tensorflow.python.keras.backend import image_data_format +from tensorflow.python.keras.backend import in_test_phase +from tensorflow.python.keras.backend import in_top_k +from tensorflow.python.keras.backend import in_train_phase +from tensorflow.python.keras.backend import int_shape +from tensorflow.python.keras.backend import is_sparse +from tensorflow.python.keras.backend import l2_normalize +from tensorflow.python.keras.backend import learning_phase +from tensorflow.python.keras.backend import less +from tensorflow.python.keras.backend import less_equal +from tensorflow.python.keras.backend import log +from tensorflow.python.keras.backend import manual_variable_initialization +from tensorflow.python.keras.backend import map_fn +from tensorflow.python.keras.backend import max +from tensorflow.python.keras.backend import maximum +from tensorflow.python.keras.backend import mean +from tensorflow.python.keras.backend import min +from tensorflow.python.keras.backend import minimum +from tensorflow.python.keras.backend import moving_average_update +from tensorflow.python.keras.backend import name_scope +from tensorflow.python.keras.backend import ndim +from tensorflow.python.keras.backend import 
normalize_batch_in_training +from tensorflow.python.keras.backend import not_equal +from tensorflow.python.keras.backend import one_hot +from tensorflow.python.keras.backend import ones +from tensorflow.python.keras.backend import ones_like +from tensorflow.python.keras.backend import permute_dimensions +from tensorflow.python.keras.backend import placeholder +from tensorflow.python.keras.backend import pool2d +from tensorflow.python.keras.backend import pool3d +from tensorflow.python.keras.backend import pow +from tensorflow.python.keras.backend import print_tensor +from tensorflow.python.keras.backend import prod +from tensorflow.python.keras.backend import random_binomial +from tensorflow.python.keras.backend import random_normal +from tensorflow.python.keras.backend import random_normal_variable +from tensorflow.python.keras.backend import random_uniform +from tensorflow.python.keras.backend import random_uniform_variable +from tensorflow.python.keras.backend import relu +from tensorflow.python.keras.backend import repeat +from tensorflow.python.keras.backend import repeat_elements +from tensorflow.python.keras.backend import reset_uids +from tensorflow.python.keras.backend import reshape +from tensorflow.python.keras.backend import resize_images +from tensorflow.python.keras.backend import resize_volumes +from tensorflow.python.keras.backend import reverse +from tensorflow.python.keras.backend import rnn +from tensorflow.python.keras.backend import round +from tensorflow.python.keras.backend import separable_conv2d +from tensorflow.python.keras.backend import set_epsilon +from tensorflow.python.keras.backend import set_floatx +from tensorflow.python.keras.backend import set_image_data_format +from tensorflow.python.keras.backend import set_learning_phase +from tensorflow.python.keras.backend import set_session +from tensorflow.python.keras.backend import set_value +from tensorflow.python.keras.backend import shape +from tensorflow.python.keras.backend import sigmoid +from tensorflow.python.keras.backend import sign +from tensorflow.python.keras.backend import sin +from tensorflow.python.keras.backend import softmax +from tensorflow.python.keras.backend import softplus +from tensorflow.python.keras.backend import softsign +from tensorflow.python.keras.backend import sparse_categorical_crossentropy +from tensorflow.python.keras.backend import spatial_2d_padding +from tensorflow.python.keras.backend import spatial_3d_padding +from tensorflow.python.keras.backend import sqrt +from tensorflow.python.keras.backend import square +from tensorflow.python.keras.backend import squeeze +from tensorflow.python.keras.backend import stack +from tensorflow.python.keras.backend import std +from tensorflow.python.keras.backend import stop_gradient +from tensorflow.python.keras.backend import sum +from tensorflow.python.keras.backend import switch +from tensorflow.python.keras.backend import tanh +from tensorflow.python.keras.backend import temporal_padding +from tensorflow.python.keras.backend import to_dense +from tensorflow.python.keras.backend import transpose +from tensorflow.python.keras.backend import truncated_normal +from tensorflow.python.keras.backend import update +from tensorflow.python.keras.backend import update_add +from tensorflow.python.keras.backend import update_sub +from tensorflow.python.keras.backend import var +from tensorflow.python.keras.backend import variable +from tensorflow.python.keras.backend import zeros +from tensorflow.python.keras.backend import zeros_like del 
absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py index 2d884790ddb9cc..10e05f2969bc40 100644 --- a/tensorflow/contrib/keras/api/keras/callbacks/__init__.py +++ b/tensorflow/contrib/keras/api/keras/callbacks/__init__.py @@ -18,19 +18,19 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.callbacks import BaseLogger -from tensorflow.python.keras._impl.keras.callbacks import Callback -from tensorflow.python.keras._impl.keras.callbacks import CSVLogger -from tensorflow.python.keras._impl.keras.callbacks import EarlyStopping -from tensorflow.python.keras._impl.keras.callbacks import History -from tensorflow.python.keras._impl.keras.callbacks import LambdaCallback -from tensorflow.python.keras._impl.keras.callbacks import LearningRateScheduler -from tensorflow.python.keras._impl.keras.callbacks import ModelCheckpoint -from tensorflow.python.keras._impl.keras.callbacks import ProgbarLogger -from tensorflow.python.keras._impl.keras.callbacks import ReduceLROnPlateau -from tensorflow.python.keras._impl.keras.callbacks import RemoteMonitor -from tensorflow.python.keras._impl.keras.callbacks import TensorBoard -from tensorflow.python.keras._impl.keras.callbacks import TerminateOnNaN +from tensorflow.python.keras.callbacks import BaseLogger +from tensorflow.python.keras.callbacks import Callback +from tensorflow.python.keras.callbacks import CSVLogger +from tensorflow.python.keras.callbacks import EarlyStopping +from tensorflow.python.keras.callbacks import History +from tensorflow.python.keras.callbacks import LambdaCallback +from tensorflow.python.keras.callbacks import LearningRateScheduler +from tensorflow.python.keras.callbacks import ModelCheckpoint +from tensorflow.python.keras.callbacks import ProgbarLogger +from tensorflow.python.keras.callbacks import ReduceLROnPlateau +from tensorflow.python.keras.callbacks import RemoteMonitor +from tensorflow.python.keras.callbacks import TensorBoard +from tensorflow.python.keras.callbacks import TerminateOnNaN del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/constraints/__init__.py b/tensorflow/contrib/keras/api/keras/constraints/__init__.py index 152606d8ebbcad..08debf974ec3a3 100644 --- a/tensorflow/contrib/keras/api/keras/constraints/__init__.py +++ b/tensorflow/contrib/keras/api/keras/constraints/__init__.py @@ -19,21 +19,21 @@ from __future__ import print_function # Constraints functions / callable classes. 
-from tensorflow.python.keras._impl.keras.constraints import Constraint -from tensorflow.python.keras._impl.keras.constraints import max_norm -from tensorflow.python.keras._impl.keras.constraints import MaxNorm -from tensorflow.python.keras._impl.keras.constraints import min_max_norm -from tensorflow.python.keras._impl.keras.constraints import MinMaxNorm -from tensorflow.python.keras._impl.keras.constraints import non_neg -from tensorflow.python.keras._impl.keras.constraints import NonNeg -from tensorflow.python.keras._impl.keras.constraints import unit_norm -from tensorflow.python.keras._impl.keras.constraints import UnitNorm +from tensorflow.python.keras.constraints import Constraint +from tensorflow.python.keras.constraints import max_norm +from tensorflow.python.keras.constraints import MaxNorm +from tensorflow.python.keras.constraints import min_max_norm +from tensorflow.python.keras.constraints import MinMaxNorm +from tensorflow.python.keras.constraints import non_neg +from tensorflow.python.keras.constraints import NonNeg +from tensorflow.python.keras.constraints import unit_norm +from tensorflow.python.keras.constraints import UnitNorm # Auxiliary utils. # pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.constraints import deserialize -from tensorflow.python.keras._impl.keras.constraints import serialize -from tensorflow.python.keras._impl.keras.constraints import get +from tensorflow.python.keras.constraints import deserialize +from tensorflow.python.keras.constraints import serialize +from tensorflow.python.keras.constraints import get del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py index b5371a03fd5f57..a5a6fdab445d2d 100644 --- a/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py +++ b/tensorflow/contrib/keras/api/keras/datasets/boston_housing/__init__.py @@ -18,7 +18,7 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.datasets.boston_housing import load_data +from tensorflow.python.keras.datasets.boston_housing import load_data del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py index 68d3eb789ea2c4..e74e5f347df2ee 100644 --- a/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py +++ b/tensorflow/contrib/keras/api/keras/datasets/cifar10/__init__.py @@ -18,7 +18,7 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.datasets.cifar10 import load_data +from tensorflow.python.keras.datasets.cifar10 import load_data del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py index ca937426733416..8f5753a6360dfb 100644 --- a/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py +++ b/tensorflow/contrib/keras/api/keras/datasets/cifar100/__init__.py @@ -18,7 +18,7 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.datasets.cifar100 import load_data +from tensorflow.python.keras.datasets.cifar100 import load_data del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py 
b/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py index 1c6396d2d32b88..bd6ec4b8dfb034 100644 --- a/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py +++ b/tensorflow/contrib/keras/api/keras/datasets/imdb/__init__.py @@ -18,8 +18,8 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.datasets.imdb import get_word_index -from tensorflow.python.keras._impl.keras.datasets.imdb import load_data +from tensorflow.python.keras.datasets.imdb import get_word_index +from tensorflow.python.keras.datasets.imdb import load_data del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py index 364255f3387b59..f61145655bd5d9 100644 --- a/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py +++ b/tensorflow/contrib/keras/api/keras/datasets/mnist/__init__.py @@ -18,7 +18,7 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.datasets.mnist import load_data +from tensorflow.python.keras.datasets.mnist import load_data del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py b/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py index bb6791a344ad0c..ade31f4ea9c332 100644 --- a/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py +++ b/tensorflow/contrib/keras/api/keras/datasets/reuters/__init__.py @@ -18,8 +18,8 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.datasets.reuters import get_word_index -from tensorflow.python.keras._impl.keras.datasets.reuters import load_data +from tensorflow.python.keras.datasets.reuters import get_word_index +from tensorflow.python.keras.datasets.reuters import load_data del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/initializers/__init__.py b/tensorflow/contrib/keras/api/keras/initializers/__init__.py index 6b1fcfd2d9585d..c6bdc4f0dac3f4 100644 --- a/tensorflow/contrib/keras/api/keras/initializers/__init__.py +++ b/tensorflow/contrib/keras/api/keras/initializers/__init__.py @@ -19,30 +19,30 @@ from __future__ import print_function # Initializer functions / callable classes. 
-from tensorflow.python.keras._impl.keras.initializers import Constant -from tensorflow.python.keras._impl.keras.initializers import Identity -from tensorflow.python.keras._impl.keras.initializers import Initializer -from tensorflow.python.keras._impl.keras.initializers import Ones -from tensorflow.python.keras._impl.keras.initializers import Orthogonal -from tensorflow.python.keras._impl.keras.initializers import RandomNormal -from tensorflow.python.keras._impl.keras.initializers import RandomUniform -from tensorflow.python.keras._impl.keras.initializers import TruncatedNormal -from tensorflow.python.keras._impl.keras.initializers import VarianceScaling -from tensorflow.python.keras._impl.keras.initializers import Zeros +from tensorflow.python.keras.initializers import Constant +from tensorflow.python.keras.initializers import Identity +from tensorflow.python.keras.initializers import Initializer +from tensorflow.python.keras.initializers import Ones +from tensorflow.python.keras.initializers import Orthogonal +from tensorflow.python.keras.initializers import RandomNormal +from tensorflow.python.keras.initializers import RandomUniform +from tensorflow.python.keras.initializers import TruncatedNormal +from tensorflow.python.keras.initializers import VarianceScaling +from tensorflow.python.keras.initializers import Zeros # Functional interface. # pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.initializers import glorot_normal -from tensorflow.python.keras._impl.keras.initializers import glorot_uniform -from tensorflow.python.keras._impl.keras.initializers import he_normal -from tensorflow.python.keras._impl.keras.initializers import he_uniform -from tensorflow.python.keras._impl.keras.initializers import lecun_normal -from tensorflow.python.keras._impl.keras.initializers import lecun_uniform +from tensorflow.python.keras.initializers import glorot_normal +from tensorflow.python.keras.initializers import glorot_uniform +from tensorflow.python.keras.initializers import he_normal +from tensorflow.python.keras.initializers import he_uniform +from tensorflow.python.keras.initializers import lecun_normal +from tensorflow.python.keras.initializers import lecun_uniform # Auxiliary utils. -from tensorflow.python.keras._impl.keras.initializers import deserialize -from tensorflow.python.keras._impl.keras.initializers import serialize -from tensorflow.python.keras._impl.keras.initializers import get +from tensorflow.python.keras.initializers import deserialize +from tensorflow.python.keras.initializers import serialize +from tensorflow.python.keras.initializers import get del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py index acf0a5e1799b7c..938c881fcbe186 100644 --- a/tensorflow/contrib/keras/api/keras/layers/__init__.py +++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py @@ -20,128 +20,128 @@ # Generic layers. # pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.engine import Input -from tensorflow.python.keras._impl.keras.engine import InputLayer -from tensorflow.python.keras._impl.keras.engine import InputSpec -from tensorflow.python.keras._impl.keras.engine import Layer +from tensorflow.python.keras.engine import Input +from tensorflow.python.keras.engine import InputLayer +from tensorflow.python.keras.engine import InputSpec +from tensorflow.python.keras.engine import Layer # Advanced activations. 
-from tensorflow.python.keras._impl.keras.layers.advanced_activations import LeakyReLU -from tensorflow.python.keras._impl.keras.layers.advanced_activations import PReLU -from tensorflow.python.keras._impl.keras.layers.advanced_activations import ELU -from tensorflow.python.keras._impl.keras.layers.advanced_activations import ThresholdedReLU +from tensorflow.python.keras.layers.advanced_activations import LeakyReLU +from tensorflow.python.keras.layers.advanced_activations import PReLU +from tensorflow.python.keras.layers.advanced_activations import ELU +from tensorflow.python.keras.layers.advanced_activations import ThresholdedReLU # Convolution layers. -from tensorflow.python.keras._impl.keras.layers.convolutional import Conv1D -from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2D -from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3D -from tensorflow.python.keras._impl.keras.layers.convolutional import Conv2DTranspose -from tensorflow.python.keras._impl.keras.layers.convolutional import Conv3DTranspose -from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConv2D +from tensorflow.python.keras.layers.convolutional import Conv1D +from tensorflow.python.keras.layers.convolutional import Conv2D +from tensorflow.python.keras.layers.convolutional import Conv3D +from tensorflow.python.keras.layers.convolutional import Conv2DTranspose +from tensorflow.python.keras.layers.convolutional import Conv3DTranspose +from tensorflow.python.keras.layers.convolutional import SeparableConv2D # Convolution layer aliases. -from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution1D -from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2D -from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3D -from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution2DTranspose -from tensorflow.python.keras._impl.keras.layers.convolutional import Convolution3DTranspose -from tensorflow.python.keras._impl.keras.layers.convolutional import SeparableConvolution2D +from tensorflow.python.keras.layers.convolutional import Convolution1D +from tensorflow.python.keras.layers.convolutional import Convolution2D +from tensorflow.python.keras.layers.convolutional import Convolution3D +from tensorflow.python.keras.layers.convolutional import Convolution2DTranspose +from tensorflow.python.keras.layers.convolutional import Convolution3DTranspose +from tensorflow.python.keras.layers.convolutional import SeparableConvolution2D # Image processing layers. 
-from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling1D -from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling2D -from tensorflow.python.keras._impl.keras.layers.convolutional import UpSampling3D -from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding1D -from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding2D -from tensorflow.python.keras._impl.keras.layers.convolutional import ZeroPadding3D -from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping1D -from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping2D -from tensorflow.python.keras._impl.keras.layers.convolutional import Cropping3D +from tensorflow.python.keras.layers.convolutional import UpSampling1D +from tensorflow.python.keras.layers.convolutional import UpSampling2D +from tensorflow.python.keras.layers.convolutional import UpSampling3D +from tensorflow.python.keras.layers.convolutional import ZeroPadding1D +from tensorflow.python.keras.layers.convolutional import ZeroPadding2D +from tensorflow.python.keras.layers.convolutional import ZeroPadding3D +from tensorflow.python.keras.layers.convolutional import Cropping1D +from tensorflow.python.keras.layers.convolutional import Cropping2D +from tensorflow.python.keras.layers.convolutional import Cropping3D # Convolutional-recurrent layers. -from tensorflow.python.keras._impl.keras.layers.convolutional_recurrent import ConvLSTM2D +from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D # Core layers. -from tensorflow.python.keras._impl.keras.layers.core import Masking -from tensorflow.python.keras._impl.keras.layers.core import Dropout -from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout1D -from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout2D -from tensorflow.python.keras._impl.keras.layers.core import SpatialDropout3D -from tensorflow.python.keras._impl.keras.layers.core import Activation -from tensorflow.python.keras._impl.keras.layers.core import Reshape -from tensorflow.python.keras._impl.keras.layers.core import Permute -from tensorflow.python.keras._impl.keras.layers.core import Flatten -from tensorflow.python.keras._impl.keras.layers.core import RepeatVector -from tensorflow.python.keras._impl.keras.layers.core import Lambda -from tensorflow.python.keras._impl.keras.layers.core import Dense -from tensorflow.python.keras._impl.keras.layers.core import ActivityRegularization +from tensorflow.python.keras.layers.core import Masking +from tensorflow.python.keras.layers.core import Dropout +from tensorflow.python.keras.layers.core import SpatialDropout1D +from tensorflow.python.keras.layers.core import SpatialDropout2D +from tensorflow.python.keras.layers.core import SpatialDropout3D +from tensorflow.python.keras.layers.core import Activation +from tensorflow.python.keras.layers.core import Reshape +from tensorflow.python.keras.layers.core import Permute +from tensorflow.python.keras.layers.core import Flatten +from tensorflow.python.keras.layers.core import RepeatVector +from tensorflow.python.keras.layers.core import Lambda +from tensorflow.python.keras.layers.core import Dense +from tensorflow.python.keras.layers.core import ActivityRegularization # Embedding layers. -from tensorflow.python.keras._impl.keras.layers.embeddings import Embedding +from tensorflow.python.keras.layers.embeddings import Embedding # Locally-connected layers. 
-from tensorflow.python.keras._impl.keras.layers.local import LocallyConnected1D -from tensorflow.python.keras._impl.keras.layers.local import LocallyConnected2D +from tensorflow.python.keras.layers.local import LocallyConnected1D +from tensorflow.python.keras.layers.local import LocallyConnected2D # Merge layers. -from tensorflow.python.keras._impl.keras.layers.merge import Add -from tensorflow.python.keras._impl.keras.layers.merge import Multiply -from tensorflow.python.keras._impl.keras.layers.merge import Average -from tensorflow.python.keras._impl.keras.layers.merge import Maximum -from tensorflow.python.keras._impl.keras.layers.merge import Concatenate -from tensorflow.python.keras._impl.keras.layers.merge import Dot -from tensorflow.python.keras._impl.keras.layers.merge import add -from tensorflow.python.keras._impl.keras.layers.merge import multiply -from tensorflow.python.keras._impl.keras.layers.merge import average -from tensorflow.python.keras._impl.keras.layers.merge import maximum -from tensorflow.python.keras._impl.keras.layers.merge import concatenate -from tensorflow.python.keras._impl.keras.layers.merge import dot +from tensorflow.python.keras.layers.merge import Add +from tensorflow.python.keras.layers.merge import Multiply +from tensorflow.python.keras.layers.merge import Average +from tensorflow.python.keras.layers.merge import Maximum +from tensorflow.python.keras.layers.merge import Concatenate +from tensorflow.python.keras.layers.merge import Dot +from tensorflow.python.keras.layers.merge import add +from tensorflow.python.keras.layers.merge import multiply +from tensorflow.python.keras.layers.merge import average +from tensorflow.python.keras.layers.merge import maximum +from tensorflow.python.keras.layers.merge import concatenate +from tensorflow.python.keras.layers.merge import dot # Noise layers. -from tensorflow.python.keras._impl.keras.layers.noise import AlphaDropout -from tensorflow.python.keras._impl.keras.layers.noise import GaussianNoise -from tensorflow.python.keras._impl.keras.layers.noise import GaussianDropout +from tensorflow.python.keras.layers.noise import AlphaDropout +from tensorflow.python.keras.layers.noise import GaussianNoise +from tensorflow.python.keras.layers.noise import GaussianDropout # Normalization layers. -from tensorflow.python.keras._impl.keras.layers.normalization import BatchNormalization +from tensorflow.python.keras.layers.normalization import BatchNormalization # Pooling layers. 
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling1D -from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling2D -from tensorflow.python.keras._impl.keras.layers.pooling import MaxPooling3D -from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling1D -from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling2D -from tensorflow.python.keras._impl.keras.layers.pooling import AveragePooling3D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling1D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling2D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAveragePooling3D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling1D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling2D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPooling3D +from tensorflow.python.keras.layers.pooling import MaxPooling1D +from tensorflow.python.keras.layers.pooling import MaxPooling2D +from tensorflow.python.keras.layers.pooling import MaxPooling3D +from tensorflow.python.keras.layers.pooling import AveragePooling1D +from tensorflow.python.keras.layers.pooling import AveragePooling2D +from tensorflow.python.keras.layers.pooling import AveragePooling3D +from tensorflow.python.keras.layers.pooling import GlobalAveragePooling1D +from tensorflow.python.keras.layers.pooling import GlobalAveragePooling2D +from tensorflow.python.keras.layers.pooling import GlobalAveragePooling3D +from tensorflow.python.keras.layers.pooling import GlobalMaxPooling1D +from tensorflow.python.keras.layers.pooling import GlobalMaxPooling2D +from tensorflow.python.keras.layers.pooling import GlobalMaxPooling3D # Pooling layer aliases. 
-from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool1D -from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool2D -from tensorflow.python.keras._impl.keras.layers.pooling import MaxPool3D -from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool1D -from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool2D -from tensorflow.python.keras._impl.keras.layers.pooling import AvgPool3D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool1D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool2D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalAvgPool3D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool1D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool2D -from tensorflow.python.keras._impl.keras.layers.pooling import GlobalMaxPool3D +from tensorflow.python.keras.layers.pooling import MaxPool1D +from tensorflow.python.keras.layers.pooling import MaxPool2D +from tensorflow.python.keras.layers.pooling import MaxPool3D +from tensorflow.python.keras.layers.pooling import AvgPool1D +from tensorflow.python.keras.layers.pooling import AvgPool2D +from tensorflow.python.keras.layers.pooling import AvgPool3D +from tensorflow.python.keras.layers.pooling import GlobalAvgPool1D +from tensorflow.python.keras.layers.pooling import GlobalAvgPool2D +from tensorflow.python.keras.layers.pooling import GlobalAvgPool3D +from tensorflow.python.keras.layers.pooling import GlobalMaxPool1D +from tensorflow.python.keras.layers.pooling import GlobalMaxPool2D +from tensorflow.python.keras.layers.pooling import GlobalMaxPool3D # Recurrent layers. -from tensorflow.python.keras._impl.keras.layers.recurrent import SimpleRNN -from tensorflow.python.keras._impl.keras.layers.recurrent import GRU -from tensorflow.python.keras._impl.keras.layers.recurrent import LSTM +from tensorflow.python.keras.layers.recurrent import SimpleRNN +from tensorflow.python.keras.layers.recurrent import GRU +from tensorflow.python.keras.layers.recurrent import LSTM # Wrapper functions -from tensorflow.python.keras._impl.keras.layers.wrappers import Wrapper -from tensorflow.python.keras._impl.keras.layers.wrappers import Bidirectional -from tensorflow.python.keras._impl.keras.layers.wrappers import TimeDistributed +from tensorflow.python.keras.layers.wrappers import Wrapper +from tensorflow.python.keras.layers.wrappers import Bidirectional +from tensorflow.python.keras.layers.wrappers import TimeDistributed del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/losses/__init__.py b/tensorflow/contrib/keras/api/keras/losses/__init__.py index 66721b694f5fd5..c4476a7bbd5056 100644 --- a/tensorflow/contrib/keras/api/keras/losses/__init__.py +++ b/tensorflow/contrib/keras/api/keras/losses/__init__.py @@ -19,26 +19,26 @@ from __future__ import print_function # Loss functions. 
-from tensorflow.python.keras._impl.keras.losses import binary_crossentropy -from tensorflow.python.keras._impl.keras.losses import categorical_crossentropy -from tensorflow.python.keras._impl.keras.losses import categorical_hinge -from tensorflow.python.keras._impl.keras.losses import cosine_proximity -from tensorflow.python.keras._impl.keras.losses import hinge -from tensorflow.python.keras._impl.keras.losses import kullback_leibler_divergence -from tensorflow.python.keras._impl.keras.losses import logcosh -from tensorflow.python.keras._impl.keras.losses import mean_absolute_error -from tensorflow.python.keras._impl.keras.losses import mean_absolute_percentage_error -from tensorflow.python.keras._impl.keras.losses import mean_squared_error -from tensorflow.python.keras._impl.keras.losses import mean_squared_logarithmic_error -from tensorflow.python.keras._impl.keras.losses import poisson -from tensorflow.python.keras._impl.keras.losses import sparse_categorical_crossentropy -from tensorflow.python.keras._impl.keras.losses import squared_hinge +from tensorflow.python.keras.losses import binary_crossentropy +from tensorflow.python.keras.losses import categorical_crossentropy +from tensorflow.python.keras.losses import categorical_hinge +from tensorflow.python.keras.losses import cosine_proximity +from tensorflow.python.keras.losses import hinge +from tensorflow.python.keras.losses import kullback_leibler_divergence +from tensorflow.python.keras.losses import logcosh +from tensorflow.python.keras.losses import mean_absolute_error +from tensorflow.python.keras.losses import mean_absolute_percentage_error +from tensorflow.python.keras.losses import mean_squared_error +from tensorflow.python.keras.losses import mean_squared_logarithmic_error +from tensorflow.python.keras.losses import poisson +from tensorflow.python.keras.losses import sparse_categorical_crossentropy +from tensorflow.python.keras.losses import squared_hinge # Auxiliary utils. # pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.losses import deserialize -from tensorflow.python.keras._impl.keras.losses import serialize -from tensorflow.python.keras._impl.keras.losses import get +from tensorflow.python.keras.losses import deserialize +from tensorflow.python.keras.losses import serialize +from tensorflow.python.keras.losses import get del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/metrics/__init__.py b/tensorflow/contrib/keras/api/keras/metrics/__init__.py index 59faf037bce0f0..7317fdb52c5b79 100644 --- a/tensorflow/contrib/keras/api/keras/metrics/__init__.py +++ b/tensorflow/contrib/keras/api/keras/metrics/__init__.py @@ -19,28 +19,28 @@ from __future__ import print_function # Metrics functions. 
-from tensorflow.python.keras._impl.keras.metrics import binary_accuracy -from tensorflow.python.keras._impl.keras.metrics import binary_crossentropy -from tensorflow.python.keras._impl.keras.metrics import categorical_accuracy -from tensorflow.python.keras._impl.keras.metrics import categorical_crossentropy -from tensorflow.python.keras._impl.keras.metrics import cosine_proximity -from tensorflow.python.keras._impl.keras.metrics import hinge -from tensorflow.python.keras._impl.keras.metrics import kullback_leibler_divergence -from tensorflow.python.keras._impl.keras.metrics import mean_absolute_error -from tensorflow.python.keras._impl.keras.metrics import mean_absolute_percentage_error -from tensorflow.python.keras._impl.keras.metrics import mean_squared_error -from tensorflow.python.keras._impl.keras.metrics import mean_squared_logarithmic_error -from tensorflow.python.keras._impl.keras.metrics import poisson -from tensorflow.python.keras._impl.keras.metrics import sparse_categorical_crossentropy -from tensorflow.python.keras._impl.keras.metrics import sparse_top_k_categorical_accuracy -from tensorflow.python.keras._impl.keras.metrics import squared_hinge -from tensorflow.python.keras._impl.keras.metrics import top_k_categorical_accuracy +from tensorflow.python.keras.metrics import binary_accuracy +from tensorflow.python.keras.metrics import binary_crossentropy +from tensorflow.python.keras.metrics import categorical_accuracy +from tensorflow.python.keras.metrics import categorical_crossentropy +from tensorflow.python.keras.metrics import cosine_proximity +from tensorflow.python.keras.metrics import hinge +from tensorflow.python.keras.metrics import kullback_leibler_divergence +from tensorflow.python.keras.metrics import mean_absolute_error +from tensorflow.python.keras.metrics import mean_absolute_percentage_error +from tensorflow.python.keras.metrics import mean_squared_error +from tensorflow.python.keras.metrics import mean_squared_logarithmic_error +from tensorflow.python.keras.metrics import poisson +from tensorflow.python.keras.metrics import sparse_categorical_crossentropy +from tensorflow.python.keras.metrics import sparse_top_k_categorical_accuracy +from tensorflow.python.keras.metrics import squared_hinge +from tensorflow.python.keras.metrics import top_k_categorical_accuracy # Auxiliary utils. 
# pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.metrics import deserialize -from tensorflow.python.keras._impl.keras.metrics import serialize -from tensorflow.python.keras._impl.keras.metrics import get +from tensorflow.python.keras.metrics import deserialize +from tensorflow.python.keras.metrics import serialize +from tensorflow.python.keras.metrics import get del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/models/__init__.py b/tensorflow/contrib/keras/api/keras/models/__init__.py index 2fb4ac0960d38f..3a196984cd88cb 100644 --- a/tensorflow/contrib/keras/api/keras/models/__init__.py +++ b/tensorflow/contrib/keras/api/keras/models/__init__.py @@ -18,13 +18,13 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.models import load_model -from tensorflow.python.keras._impl.keras.models import Model -from tensorflow.python.keras._impl.keras.models import model_from_config -from tensorflow.python.keras._impl.keras.models import model_from_json -from tensorflow.python.keras._impl.keras.models import model_from_yaml -from tensorflow.python.keras._impl.keras.models import save_model -from tensorflow.python.keras._impl.keras.models import Sequential +from tensorflow.python.keras.models import load_model +from tensorflow.python.keras.models import Model +from tensorflow.python.keras.models import model_from_config +from tensorflow.python.keras.models import model_from_json +from tensorflow.python.keras.models import model_from_yaml +from tensorflow.python.keras.models import save_model +from tensorflow.python.keras.models import Sequential del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/optimizers/__init__.py b/tensorflow/contrib/keras/api/keras/optimizers/__init__.py index 44f47bc47f4a0e..4849a06747958a 100644 --- a/tensorflow/contrib/keras/api/keras/optimizers/__init__.py +++ b/tensorflow/contrib/keras/api/keras/optimizers/__init__.py @@ -19,20 +19,20 @@ from __future__ import print_function # Optimizer classes. -from tensorflow.python.keras._impl.keras.optimizers import Adadelta -from tensorflow.python.keras._impl.keras.optimizers import Adagrad -from tensorflow.python.keras._impl.keras.optimizers import Adam -from tensorflow.python.keras._impl.keras.optimizers import Adamax -from tensorflow.python.keras._impl.keras.optimizers import Nadam -from tensorflow.python.keras._impl.keras.optimizers import Optimizer -from tensorflow.python.keras._impl.keras.optimizers import RMSprop -from tensorflow.python.keras._impl.keras.optimizers import SGD +from tensorflow.python.keras.optimizers import Adadelta +from tensorflow.python.keras.optimizers import Adagrad +from tensorflow.python.keras.optimizers import Adam +from tensorflow.python.keras.optimizers import Adamax +from tensorflow.python.keras.optimizers import Nadam +from tensorflow.python.keras.optimizers import Optimizer +from tensorflow.python.keras.optimizers import RMSprop +from tensorflow.python.keras.optimizers import SGD # Auxiliary utils. 
# pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.optimizers import deserialize -from tensorflow.python.keras._impl.keras.optimizers import serialize -from tensorflow.python.keras._impl.keras.optimizers import get +from tensorflow.python.keras.optimizers import deserialize +from tensorflow.python.keras.optimizers import serialize +from tensorflow.python.keras.optimizers import get del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py index b96e7675527041..1f9e82b41bf09b 100644 --- a/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py +++ b/tensorflow/contrib/keras/api/keras/preprocessing/image/__init__.py @@ -18,20 +18,20 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.preprocessing.image import apply_transform -from tensorflow.python.keras._impl.keras.preprocessing.image import array_to_img -from tensorflow.python.keras._impl.keras.preprocessing.image import DirectoryIterator -from tensorflow.python.keras._impl.keras.preprocessing.image import flip_axis -from tensorflow.python.keras._impl.keras.preprocessing.image import ImageDataGenerator -from tensorflow.python.keras._impl.keras.preprocessing.image import img_to_array -from tensorflow.python.keras._impl.keras.preprocessing.image import Iterator -from tensorflow.python.keras._impl.keras.preprocessing.image import load_img -from tensorflow.python.keras._impl.keras.preprocessing.image import NumpyArrayIterator -from tensorflow.python.keras._impl.keras.preprocessing.image import random_channel_shift -from tensorflow.python.keras._impl.keras.preprocessing.image import random_rotation -from tensorflow.python.keras._impl.keras.preprocessing.image import random_shear -from tensorflow.python.keras._impl.keras.preprocessing.image import random_shift -from tensorflow.python.keras._impl.keras.preprocessing.image import random_zoom +from tensorflow.python.keras.preprocessing.image import apply_transform +from tensorflow.python.keras.preprocessing.image import array_to_img +from tensorflow.python.keras.preprocessing.image import DirectoryIterator +from tensorflow.python.keras.preprocessing.image import flip_axis +from tensorflow.python.keras.preprocessing.image import ImageDataGenerator +from tensorflow.python.keras.preprocessing.image import img_to_array +from tensorflow.python.keras.preprocessing.image import Iterator +from tensorflow.python.keras.preprocessing.image import load_img +from tensorflow.python.keras.preprocessing.image import NumpyArrayIterator +from tensorflow.python.keras.preprocessing.image import random_channel_shift +from tensorflow.python.keras.preprocessing.image import random_rotation +from tensorflow.python.keras.preprocessing.image import random_shear +from tensorflow.python.keras.preprocessing.image import random_shift +from tensorflow.python.keras.preprocessing.image import random_zoom del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py index 112f6af5e588bc..9a93b6fb57ff5a 100644 --- a/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py +++ b/tensorflow/contrib/keras/api/keras/preprocessing/sequence/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from 
tensorflow.python.keras._impl.keras.preprocessing.sequence import make_sampling_table -from tensorflow.python.keras._impl.keras.preprocessing.sequence import pad_sequences -from tensorflow.python.keras._impl.keras.preprocessing.sequence import skipgrams +from tensorflow.python.keras.preprocessing.sequence import make_sampling_table +from tensorflow.python.keras.preprocessing.sequence import pad_sequences +from tensorflow.python.keras.preprocessing.sequence import skipgrams del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py b/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py index 5bf1a2fb21dc27..86386a9b6762d1 100644 --- a/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py +++ b/tensorflow/contrib/keras/api/keras/preprocessing/text/__init__.py @@ -18,9 +18,9 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.preprocessing.text import one_hot -from tensorflow.python.keras._impl.keras.preprocessing.text import text_to_word_sequence -from tensorflow.python.keras._impl.keras.preprocessing.text import Tokenizer +from tensorflow.python.keras.preprocessing.text import one_hot +from tensorflow.python.keras.preprocessing.text import text_to_word_sequence +from tensorflow.python.keras.preprocessing.text import Tokenizer del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/regularizers/__init__.py b/tensorflow/contrib/keras/api/keras/regularizers/__init__.py index 3e707ccab577b5..d668e39c09ca28 100644 --- a/tensorflow/contrib/keras/api/keras/regularizers/__init__.py +++ b/tensorflow/contrib/keras/api/keras/regularizers/__init__.py @@ -19,19 +19,19 @@ from __future__ import print_function # Regularizer functions / callable classes. -from tensorflow.python.keras._impl.keras.regularizers import L1L2 -from tensorflow.python.keras._impl.keras.regularizers import Regularizer +from tensorflow.python.keras.regularizers import L1L2 +from tensorflow.python.keras.regularizers import Regularizer # Functional interface. # pylint: disable=g-bad-import-order -from tensorflow.python.keras._impl.keras.regularizers import l1 -from tensorflow.python.keras._impl.keras.regularizers import l2 -from tensorflow.python.keras._impl.keras.regularizers import l1_l2 +from tensorflow.python.keras.regularizers import l1 +from tensorflow.python.keras.regularizers import l2 +from tensorflow.python.keras.regularizers import l1_l2 # Auxiliary utils. 
-from tensorflow.python.keras._impl.keras.regularizers import deserialize -from tensorflow.python.keras._impl.keras.regularizers import serialize -from tensorflow.python.keras._impl.keras.regularizers import get +from tensorflow.python.keras.regularizers import deserialize +from tensorflow.python.keras.regularizers import serialize +from tensorflow.python.keras.regularizers import get del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/utils/__init__.py b/tensorflow/contrib/keras/api/keras/utils/__init__.py index a7c2179fe7ad43..47cd01b924fb43 100644 --- a/tensorflow/contrib/keras/api/keras/utils/__init__.py +++ b/tensorflow/contrib/keras/api/keras/utils/__init__.py @@ -18,21 +18,21 @@ from __future__ import division from __future__ import print_function -from tensorflow.python.keras._impl.keras.utils.data_utils import GeneratorEnqueuer -from tensorflow.python.keras._impl.keras.utils.data_utils import get_file -from tensorflow.python.keras._impl.keras.utils.data_utils import Sequence -from tensorflow.python.keras._impl.keras.utils.data_utils import SequenceEnqueuer -from tensorflow.python.keras._impl.keras.utils.generic_utils import custom_object_scope -from tensorflow.python.keras._impl.keras.utils.generic_utils import CustomObjectScope -from tensorflow.python.keras._impl.keras.utils.generic_utils import deserialize_keras_object -from tensorflow.python.keras._impl.keras.utils.generic_utils import get_custom_objects -from tensorflow.python.keras._impl.keras.utils.generic_utils import Progbar -from tensorflow.python.keras._impl.keras.utils.generic_utils import serialize_keras_object -from tensorflow.python.keras._impl.keras.utils.io_utils import HDF5Matrix -from tensorflow.python.keras._impl.keras.utils.layer_utils import convert_all_kernels_in_model -from tensorflow.python.keras._impl.keras.utils.np_utils import normalize -from tensorflow.python.keras._impl.keras.utils.np_utils import to_categorical -from tensorflow.python.keras._impl.keras.utils.vis_utils import plot_model +from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer +from tensorflow.python.keras.utils.data_utils import get_file +from tensorflow.python.keras.utils.data_utils import Sequence +from tensorflow.python.keras.utils.data_utils import SequenceEnqueuer +from tensorflow.python.keras.utils.generic_utils import custom_object_scope +from tensorflow.python.keras.utils.generic_utils import CustomObjectScope +from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object +from tensorflow.python.keras.utils.generic_utils import get_custom_objects +from tensorflow.python.keras.utils.generic_utils import Progbar +from tensorflow.python.keras.utils.generic_utils import serialize_keras_object +from tensorflow.python.keras.utils.io_utils import HDF5Matrix +from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model +from tensorflow.python.keras.utils.np_utils import normalize +from tensorflow.python.keras.utils.np_utils import to_categorical +from tensorflow.python.keras.utils.vis_utils import plot_model del absolute_import del division diff --git a/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py b/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py index a46f859273ea01..c4b7aa765c26ba 100644 --- a/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py +++ b/tensorflow/contrib/keras/api/keras/wrappers/scikit_learn/__init__.py @@ -18,8 +18,8 @@ from __future__ import division from __future__ import 
print_function -from tensorflow.python.keras._impl.keras.wrappers.scikit_learn import KerasClassifier -from tensorflow.python.keras._impl.keras.wrappers.scikit_learn import KerasRegressor +from tensorflow.python.keras.wrappers.scikit_learn import KerasClassifier +from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor del absolute_import del division diff --git a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py index 91929184a2e6f3..2ff4d41d75fe59 100644 --- a/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py +++ b/tensorflow/contrib/kernel_methods/python/mappers/random_fourier_features_test.py @@ -31,7 +31,7 @@ def _inner_product(x, y): - """Inner product between tensors x and y. + r"""Inner product between tensors x and y. The input tensors are assumed to be in ROW representation, that is, the method returns \\(x * y^T\\). @@ -131,10 +131,6 @@ def testGoodKernelApproximationAmortized(self): mapped_dim = 5000 stddev = 5.0 - # TODO(sibyl-vie3Poto): Reduce test's running time before moving to third_party. One - # possible way to speed the test up is to compute both the approximate and - # the exact kernel matrix directly using matrix operations instead of - # computing the values for each pair of points separately. points_shape = [1, input_dim] points = [ random_ops.random_uniform(shape=points_shape, maxval=1.0) diff --git a/tensorflow/contrib/kfac/README.md b/tensorflow/contrib/kfac/README.md index 762a2f0b57e95e..102626925db560 100644 --- a/tensorflow/contrib/kfac/README.md +++ b/tensorflow/contrib/kfac/README.md @@ -1,5 +1,10 @@ # K-FAC: Kronecker-Factored Approximate Curvature +# WARNING: +# ==third_party/tensorflow/contrib/kfac is deprecated. This will be== +# ==removed on 15-07-2018. Please import third_party/tensorflow_kfac.== +# ==== + **K-FAC in TensorFlow** is an implementation of [K-FAC][kfac-paper], an approximate second-order optimization method, in TensorFlow. 
When applied to feedforward and convolutional neural networks, K-FAC can converge `>3.5x` diff --git a/tensorflow/contrib/kfac/examples/convnet.py b/tensorflow/contrib/kfac/examples/convnet.py index e8e3353091df25..d6b1a61b716ab7 100644 --- a/tensorflow/contrib/kfac/examples/convnet.py +++ b/tensorflow/contrib/kfac/examples/convnet.py @@ -223,26 +223,26 @@ def minimize_loss_single_machine(loss, (cov_update_thunks, inv_update_thunks) = optimizer.make_vars_and_create_op_thunks() - with tf.device(device): - train_op = optimizer.minimize(loss, global_step=g_step) - def make_update_op(update_thunks): - update_op = [thunk() for thunk in update_thunks] - return tf.group(*update_op) + update_ops = [thunk() for thunk in update_thunks] + return tf.group(*update_ops) cov_update_op = make_update_op(cov_update_thunks) - with tf.control_dependencies([train_op, cov_update_op]): + with tf.control_dependencies([cov_update_op]): inverse_op = tf.cond( - tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0), + tf.equal(tf.mod(g_step, _INVERT_EVERY), 0), lambda: make_update_op(inv_update_thunks), tf.no_op) + with tf.control_dependencies([inverse_op]): + with tf.device(device): + train_op = optimizer.minimize(loss, global_step=g_step) tf.logging.info("Starting training.") with tf.train.MonitoredTrainingSession(config=session_config) as sess: while not sess.should_stop(): global_step_, loss_, accuracy_, _ = sess.run( - [g_step, loss, accuracy, inverse_op]) + [g_step, loss, accuracy, train_op]) - if (global_step_ + 1) % _INVERT_EVERY == 0: + if global_step_ % _INVERT_EVERY == 0: tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_, loss_, accuracy_) @@ -325,7 +325,7 @@ def distributed_grads_only_and_ops_chief_worker( All workers perform gradient computation. Chief worker applies gradient after averaging the gradients obtained from all the workers. All workers block - execution untill the update is applied. Chief worker runs covariance and + execution until the update is applied. Chief worker runs covariance and inverse update ops. Covariance and inverse matrices are placed on parameter servers in a round robin manner. For further details on synchronous distributed optimization check `tf.train.SyncReplicasOptimizer`. 
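The convnet and MLP example changes in this diff all apply the same reordering: rather than fetching a separate inverse op in the training loop, the covariance update, the periodic inverse update, and the weight update are chained through `tf.control_dependencies`, so fetching `train_op` alone drives all three. A minimal sketch of that ordering follows, assuming the TF 1.x graph API; `make_train_op` is an illustrative wrapper that does not appear in the diff, while `_INVERT_EVERY` mirrors the constant used in `convnet.py`.

```python
import tensorflow as tf

_INVERT_EVERY = 10  # how often the expensive inverse update should run


def make_train_op(optimizer, loss, global_step):
  """Chains cov update -> periodic inverse update -> weight update (sketch)."""
  cov_thunks, inv_thunks = optimizer.make_vars_and_create_op_thunks()

  def make_update_op(thunks):
    # Each thunk builds one update op when called.
    return tf.group(*[thunk() for thunk in thunks])

  cov_update_op = make_update_op(cov_thunks)
  with tf.control_dependencies([cov_update_op]):
    # Only rebuild the inverses every _INVERT_EVERY steps; otherwise no-op.
    inverse_op = tf.cond(
        tf.equal(tf.mod(global_step, _INVERT_EVERY), 0),
        lambda: make_update_op(inv_thunks), tf.no_op)
  with tf.control_dependencies([inverse_op]):
    # Running the returned train op now implicitly runs the covariance update
    # and, every _INVERT_EVERY steps, the inverse update as well.
    return optimizer.minimize(loss, global_step=global_step)
```

With this ordering the training loop only needs `sess.run([global_step, loss, accuracy, train_op])`, which matches the `sess.run` changes below.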
@@ -357,24 +357,25 @@ def distributed_grads_only_and_ops_chief_worker( task_id, num_worker_tasks, num_ps_tasks, layer_collection) (cov_update_thunks, inv_update_thunks) = optimizer.make_vars_and_create_op_thunks() - train_op = sync_optimizer.minimize(loss, global_step=global_step) tf.logging.info("Starting training.") hooks = [sync_optimizer.make_session_run_hook(is_chief)] def make_update_op(update_thunks): - update_op = [thunk() for thunk in update_thunks] - return tf.group(*update_op) + update_ops = [thunk() for thunk in update_thunks] + return tf.group(*update_ops) if is_chief: cov_update_op = make_update_op(cov_update_thunks) - with tf.control_dependencies([train_op, cov_update_op]): - update_op = tf.cond( - tf.equal(tf.mod(global_step + 1, invert_every), 0), + with tf.control_dependencies([cov_update_op]): + inverse_op = tf.cond( + tf.equal(tf.mod(global_step, invert_every), 0), lambda: make_update_op(inv_update_thunks), tf.no_op) + with tf.control_dependencies([inverse_op]): + train_op = sync_optimizer.minimize(loss, global_step=global_step) else: - update_op = train_op + train_op = sync_optimizer.minimize(loss, global_step=global_step) with tf.train.MonitoredTrainingSession( master=master, @@ -384,7 +385,7 @@ def make_update_op(update_thunks): stop_grace_period_secs=0) as sess: while not sess.should_stop(): global_step_, loss_, accuracy_, _ = sess.run( - [global_step, loss, accuracy, update_op]) + [global_step, loss, accuracy, train_op]) tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_, loss_, accuracy_) return accuracy_ @@ -577,25 +578,25 @@ def train_mnist_multitower(data_dir, num_epochs, num_towers, (cov_update_thunks, inv_update_thunks) = optimizer.make_vars_and_create_op_thunks() - train_op = optimizer.minimize(loss, global_step=g_step) - def make_update_op(update_thunks): - update_op = [thunk() for thunk in update_thunks] - return tf.group(*update_op) + update_ops = [thunk() for thunk in update_thunks] + return tf.group(*update_ops) cov_update_op = make_update_op(cov_update_thunks) - with tf.control_dependencies([train_op, cov_update_op]): + with tf.control_dependencies([cov_update_op]): inverse_op = tf.cond( - tf.equal(tf.mod(g_step + 1, _INVERT_EVERY), 0), + tf.equal(tf.mod(g_step, _INVERT_EVERY), 0), lambda: make_update_op(inv_update_thunks), tf.no_op) + with tf.control_dependencies([inverse_op]): + train_op = optimizer.minimize(loss, global_step=g_step) tf.logging.info("Starting training.") with tf.train.MonitoredTrainingSession(config=session_config) as sess: while not sess.should_stop(): global_step_, loss_, accuracy_, _ = sess.run( - [g_step, loss, accuracy, inverse_op]) + [g_step, loss, accuracy, train_op]) - if (global_step_ + 1) % _INVERT_EVERY == 0: + if global_step_ % _INVERT_EVERY == 0: tf.logging.info("global_step: %d | loss: %f | accuracy: %s", global_step_, loss_, accuracy_) diff --git a/tensorflow/contrib/kfac/examples/mlp.py b/tensorflow/contrib/kfac/examples/mlp.py index 87eed03888c894..ea2b252a05702d 100644 --- a/tensorflow/contrib/kfac/examples/mlp.py +++ b/tensorflow/contrib/kfac/examples/mlp.py @@ -105,18 +105,21 @@ def build_model(examples, labels, num_labels, layer_collection): return loss, accuracy -def minimize(loss, accuracy, layer_collection, session_config=None): +def minimize(loss, accuracy, layer_collection, num_towers, session_config=None): """Minimize 'loss' with KfacOptimizer. Args: loss: 0-D Tensor. Loss to be minimized. accuracy: 0-D Tensor. Accuracy of classifier on current minibatch. 
layer_collection: LayerCollection instance. Describes layers in model. + num_towers: int. Number of CPUs to split minibatch across. session_config: tf.ConfigProto. Configuration for tf.Session(). Returns: accuracy of classifier on final minibatch. """ + devices = tuple("/cpu:%d" % tower_id for tower_id in range(num_towers)) + # Train with K-FAC. We'll use a decreasing learning rate that's cut in 1/2 # every 10k iterations. tf.logging.info("Building KFAC Optimizer.") @@ -125,27 +128,38 @@ def minimize(loss, accuracy, layer_collection, session_config=None): learning_rate=tf.train.exponential_decay( 0.00002, global_step, 10000, 0.5, staircase=True), cov_ema_decay=0.95, - damping=0.0001, + damping=0.0005, layer_collection=layer_collection, - momentum=0.99) - train_op = optimizer.minimize(loss, global_step=global_step) + momentum=0.99, + placement_strategy="round_robin", + cov_devices=devices, + inv_devices=devices) + + (cov_update_thunks, + inv_update_thunks) = optimizer.make_vars_and_create_op_thunks() + + def make_update_op(update_thunks): + update_ops = [thunk() for thunk in update_thunks] + return tf.group(*update_ops) + + # TODO(b/78537047): change (some) examples to use PeriodicInvCovUpdateKfacOpt + # once that gets moved over? Could still leave more advanced examples as they + # are (e.g. train_mnist_estimator in this file) + + cov_update_op = make_update_op(cov_update_thunks) + with tf.control_dependencies([cov_update_op]): + # We update the inverses only every 100 iterations. + inverse_op = tf.cond( + tf.equal(tf.mod(global_step, 100), 0), + lambda: make_update_op(inv_update_thunks), tf.no_op) + with tf.control_dependencies([inverse_op]): + train_op = optimizer.minimize(loss, global_step=global_step) tf.logging.info("Starting training.") with tf.train.MonitoredTrainingSession(config=session_config) as sess: while not sess.should_stop(): - # K-FAC has 3 primary ops, - # - train_op: Update the weights with the minibatch's gradient. - # - cov_update_op: Update statistics used for building K-FAC's - # preconditioner matrix. - # - inv_update_op: Update preconditioner matrix using statistics. - # - # The first 2 of these are cheap and should be done with each step. The - # latter is more expensive, and should be updated ~100 iterations. - global_step_, loss_, accuracy_, _, _ = sess.run( - [global_step, loss, accuracy, train_op, optimizer.cov_update_op]) - - if global_step_ % 100 == 0: - sess.run(optimizer.inv_update_op) + global_step_, loss_, accuracy_, _ = sess.run( + [global_step, loss, accuracy, train_op]) if global_step_ % 100 == 0: tf.logging.info("global_step: %d | loss: %f | accuracy: %f", @@ -180,7 +194,7 @@ def train_mnist(data_dir, num_epochs, use_fake_data=False): loss, accuracy = build_model(examples, labels, 10, layer_collection) # Fit model.
- minimize(loss, accuracy, layer_collection) + minimize(loss, accuracy, layer_collection, 1) def train_mnist_multitower(data_dir, @@ -238,7 +252,8 @@ def train_mnist_multitower(data_dir, "CPU": num_towers }) return minimize( - loss, accuracy, layer_collection, session_config=session_config) + loss, accuracy, layer_collection, num_towers, + session_config=session_config) def train_mnist_estimator(data_dir, num_epochs, use_fake_data=False): @@ -298,13 +313,26 @@ def model_fn(features, labels, mode, params): layer_collection=layer_collection, momentum=0.99) + (cov_update_thunks, + inv_update_thunks) = optimizer.make_vars_and_create_op_thunks() + + def make_update_op(update_thunks): + update_ops = [thunk() for thunk in update_thunks] + return tf.group(*update_ops) + + def make_batch_executed_op(update_thunks, batch_size=1): + return tf.group(*tf.contrib.kfac.utils.batch_execute( + global_step, update_thunks, batch_size=batch_size)) + # Run cov_update_op every step. Run 1 inv_update_ops per step. - cov_update_op = optimizer.cov_update_op - inv_update_op = tf.group( - tf.contrib.kfac.utils.batch_execute( - global_step, optimizer.inv_update_thunks, batch_size=1)) - with tf.control_dependencies([cov_update_op, inv_update_op]): - train_op = optimizer.minimize(loss, global_step=global_step) + cov_update_op = make_update_op(cov_update_thunks) + with tf.control_dependencies([cov_update_op]): + # But make sure to execute all the inverse ops on the first step + inverse_op = tf.cond(tf.equal(global_step, 0), + lambda: make_update_op(inv_update_thunks), + lambda: make_batch_executed_op(inv_update_thunks)) + with tf.control_dependencies([inverse_op]): + train_op = optimizer.minimize(loss, global_step=global_step) # Print metrics every 5 sec. hooks = [ diff --git a/tensorflow/contrib/kfac/examples/tests/convnet_test.py b/tensorflow/contrib/kfac/examples/tests/convnet_test.py index 6de775cc79953b..adecda71666ee7 100644 --- a/tensorflow/contrib/kfac/examples/tests/convnet_test.py +++ b/tensorflow/contrib/kfac/examples/tests/convnet_test.py @@ -157,7 +157,7 @@ def testTrainMnistDistributed(self): num_ps_tasks=0, master="", data_dir=None, - num_epochs=1, + num_epochs=2, op_strategy="chief_worker", use_fake_data=True) diff --git a/tensorflow/contrib/kfac/python/kernel_tests/BUILD b/tensorflow/contrib/kfac/python/kernel_tests/BUILD index 2477d2bfc12c2d..6e4a8d71baa85d 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/BUILD +++ b/tensorflow/contrib/kfac/python/kernel_tests/BUILD @@ -58,6 +58,7 @@ py_test( deps = [ "//tensorflow/contrib/kfac/python/ops:fisher_blocks", "//tensorflow/contrib/kfac/python/ops:layer_collection", + "//tensorflow/contrib/kfac/python/ops:linear_operator", "//tensorflow/contrib/kfac/python/ops:utils", "//tensorflow/python:array_ops", "//tensorflow/python:client_testlib", @@ -96,6 +97,7 @@ py_test( srcs = ["optimizer_test.py"], srcs_version = "PY2AND3", deps = [ + "//tensorflow/contrib/kfac/python/ops:fisher_factors", "//tensorflow/contrib/kfac/python/ops:kfac_optimizer", "//tensorflow/contrib/kfac/python/ops:layer_collection", "//tensorflow/python:array_ops", diff --git a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py index f22dbcf2156629..0e65d419a31838 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py +++ b/tensorflow/contrib/kfac/python/kernel_tests/estimator_test.py @@ -81,7 +81,7 @@ def testEstimatorInitManualRegistration(self): damping=0.2, 
layer_collection=self.layer_collection ) - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() # Check that we throw an error if we don't include registered variables, # i.e. self.weights @@ -91,7 +91,7 @@ def testEstimatorInitManualRegistration(self): cov_ema_decay=0.1, damping=0.2, layer_collection=self.layer_collection) - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() @test.mock.patch.object(utils.SubGraph, "variable_uses", return_value=42) def testVariableWrongNumberOfUses(self, mock_uses): @@ -101,7 +101,7 @@ def testVariableWrongNumberOfUses(self, mock_uses): cov_ema_decay=0.1, damping=0.2, layer_collection=self.layer_collection) - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() def testInvalidEstimationMode(self): with self.assertRaises(ValueError): @@ -111,7 +111,7 @@ def testInvalidEstimationMode(self): damping=0.2, layer_collection=self.layer_collection, estimation_mode="not_a_real_mode") - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() def testGradientsModeBuild(self): with self._graph.as_default(): @@ -121,7 +121,7 @@ def testGradientsModeBuild(self): damping=0.2, layer_collection=self.layer_collection, estimation_mode="gradients") - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() def testEmpiricalModeBuild(self): with self._graph.as_default(): @@ -131,7 +131,7 @@ def testEmpiricalModeBuild(self): damping=0.2, layer_collection=self.layer_collection, estimation_mode="empirical") - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() def testCurvaturePropModeBuild(self): with self._graph.as_default(): @@ -141,7 +141,7 @@ def testCurvaturePropModeBuild(self): damping=0.2, layer_collection=self.layer_collection, estimation_mode="curvature_prop") - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() def testExactModeBuild(self): with self._graph.as_default(): @@ -151,7 +151,7 @@ def testExactModeBuild(self): damping=0.2, layer_collection=self.layer_collection, estimation_mode="exact") - est.make_ops_and_vars() + est.make_vars_and_create_op_thunks() def test_cov_update_thunks(self): """Ensures covariance update ops run once per global_step.""" @@ -215,8 +215,11 @@ def test_round_robin_placement(self): inv_devices=["/cpu:{}".format(i) for i in range(2)]) # Construct an op that executes one covariance update per step. 
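In the estimator tests above, every `make_ops_and_vars()` call becomes `make_vars_and_create_op_thunks()`, and the round-robin placement test that follows materializes the returned thunks into ops before checking their devices. One reason thunks are useful for placement strategies is that an op inherits whatever device scope is active when the thunk is called, not when it is defined. A small, self-contained sketch of that mechanism (the `tf.add` thunks are stand-ins, not K-FAC code):

```python
import tensorflow as tf

# Two deferred "update" ops; their device is decided at call time.
thunks = [lambda: tf.add(1.0, 2.0, name="update_0"),
          lambda: tf.add(3.0, 4.0, name="update_1")]

ops = []
for i, thunk in enumerate(thunks):
  with tf.device("/cpu:%d" % i):  # round-robin over two (logical) CPU devices
    ops.append(thunk())

print(ops[0].device)  # /device:CPU:0
print(ops[1].device)  # /device:CPU:1
```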
- (cov_update_ops, _, inv_update_ops, _, _, - _) = fisher_estimator.make_ops_and_vars(scope="test") + (cov_update_thunks, + inv_update_thunks) = fisher_estimator.make_vars_and_create_op_thunks( + scope="test") + cov_update_ops = tuple(thunk() for thunk in cov_update_thunks) + inv_update_ops = tuple(thunk() for thunk in inv_update_thunks) self.assertEqual(cov_update_ops[0].device, "/device:CPU:0") self.assertEqual(cov_update_ops[1].device, "/device:CPU:1") self.assertEqual(inv_update_ops[0].device, "/device:CPU:0") diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py index 6eda6c31e34370..86ec7a095afdf4 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py +++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_blocks_test.py @@ -21,7 +21,9 @@ import numpy as np from tensorflow.contrib.kfac.python.ops import fisher_blocks as fb +from tensorflow.contrib.kfac.python.ops import fisher_factors as ff from tensorflow.contrib.kfac.python.ops import layer_collection as lc +from tensorflow.contrib.kfac.python.ops import linear_operator as lo from tensorflow.contrib.kfac.python.ops import utils from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed @@ -34,6 +36,19 @@ from tensorflow.python.platform import test +# We need to set these constants since the numerical values used in the tests +# were chosen when these used to be the defaults. +ff.set_global_constants(init_covariances_at_zero=False, + zero_debias=False, + init_inverses_at_zero=False) + +# TODO(b/78538100): As far as I can tell, all the tests that say "Make sure our +# inverse is something other than the identity" are actually broken. They never +# run the covariance update ops and so the inverse actually is the identity +# (possible plus the damping term, which would still make it a multiple of the +# identity). + + def _make_psd(dim): """Constructs a PSD matrix of the given dimension.""" mat = np.ones((dim, dim), dtype=np.float32) @@ -46,8 +61,9 @@ class UtilsTest(test.TestCase): def testComputePiTracenorm(self): with ops.Graph().as_default(), self.test_session() as sess: random_seed.set_random_seed(200) - left_factor = array_ops.diag([1., 2., 0., 1.]) - right_factor = array_ops.ones([2., 2.]) + diag = ops.convert_to_tensor([1., 2., 0., 1.]) + left_factor = lo.LinearOperatorDiag(diag) + right_factor = lo.LinearOperatorFullMatrix(array_ops.ones([2, 2])) # pi is the sqrt of the left trace norm divided by the right trace norm pi = fb.compute_pi_tracenorm(left_factor, right_factor) @@ -245,7 +261,6 @@ def testMultiplyInverseAgainstExplicit(self): full = sess.run(block.full_fisher_block()) explicit = np.dot(np.linalg.inv(full + damping * np.eye(3)), v_flat) - self.assertAllClose(output_flat, explicit) diff --git a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py index 432b67e5690003..fad47cd02f372e 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py +++ b/tensorflow/contrib/kfac/python/kernel_tests/fisher_factors_test.py @@ -35,6 +35,13 @@ from tensorflow.python.platform import test +# We need to set these constants since the numerical values used in the tests +# were chosen when these used to be the defaults. 
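The comment just above refers to the default-flipping change in `fisher_factors.py` further down in this diff: `INIT_COVARIANCES_AT_ZERO`, `ZERO_DEBIAS`, and the new `INIT_INVERSES_AT_ZERO` now default to `True`, so the tests pin the old values via the `ff.set_global_constants(...)` call that follows. A rough, hedged illustration of what those flags control, assuming the `tf.contrib.kfac` module from this branch is importable:

```python
import tensorflow as tf
from tensorflow.contrib.kfac.python.ops import fisher_factors as ff

# Restore the old defaults, as the updated tests do.
ff.set_global_constants(init_covariances_at_zero=False,
                        zero_debias=False,
                        init_inverses_at_zero=False)

with tf.Session() as sess:
  # With the flags above set to False, both initializers produce the identity;
  # with the new defaults (True) they would produce all-zero matrices instead.
  print(sess.run(ff.covariance_initializer([3, 3], tf.float32)))
  print(sess.run(ff.inverse_initializer([3, 3], tf.float32)))
```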
+ff.set_global_constants(init_covariances_at_zero=False, + zero_debias=False, + init_inverses_at_zero=False) + + def make_damping_func(damping): return fb._package_func(lambda: damping, damping) @@ -70,35 +77,44 @@ def make_inverse_update_ops(self): def get_cov(self): return NotImplementedError - def left_multiply(self, x, damping): + def instantiate_inv_variables(self): return NotImplementedError - def right_multiply(self, x, damping): - return NotImplementedError + def _num_towers(self): + raise NotImplementedError - def left_multiply_matpower(self, x, exp, damping): - return NotImplementedError + def _get_data_device(self): + raise NotImplementedError - def right_multiply_matpower(self, x, exp, damping): - return NotImplementedError + def register_matpower(self, exp, damping_func): + raise NotImplementedError - def instantiate_inv_variables(self): - return NotImplementedError + def register_cholesky(self, damping_func): + raise NotImplementedError - def _num_towers(self): + def register_cholesky_inverse(self, damping_func): raise NotImplementedError - def _get_data_device(self): + def get_matpower(self, exp, damping_func): + raise NotImplementedError + + def get_cholesky(self, damping_func): + raise NotImplementedError + + def get_cholesky_inverse(self, damping_func): + raise NotImplementedError + + def get_cov_as_linear_operator(self): raise NotImplementedError -class InverseProvidingFactorTestingDummy(ff.InverseProvidingFactor): - """Dummy class to test the non-abstract methods on ff.InverseProvidingFactor. +class DenseSquareMatrixFactorTestingDummy(ff.DenseSquareMatrixFactor): + """Dummy class to test the non-abstract methods on ff.DenseSquareMatrixFactor. """ def __init__(self, shape): self._shape = shape - super(InverseProvidingFactorTestingDummy, self).__init__() + super(DenseSquareMatrixFactorTestingDummy, self).__init__() @property def _var_scope(self): @@ -230,13 +246,13 @@ def testMakeInverseUpdateOps(self): self.assertEqual(0, len(factor.make_inverse_update_ops())) -class InverseProvidingFactorTest(test.TestCase): +class DenseSquareMatrixFactorTest(test.TestCase): def testRegisterDampedInverse(self): with tf_ops.Graph().as_default(): random_seed.set_random_seed(200) shape = [2, 2] - factor = InverseProvidingFactorTestingDummy(shape) + factor = DenseSquareMatrixFactorTestingDummy(shape) factor_var_scope = 'dummy/a_b_c' damping_funcs = [make_damping_func(0.1), @@ -248,22 +264,25 @@ def testRegisterDampedInverse(self): factor.instantiate_inv_variables() - inv = factor.get_inverse(damping_funcs[0]) - self.assertEqual(inv, factor.get_inverse(damping_funcs[1])) - self.assertNotEqual(inv, factor.get_inverse(damping_funcs[2])) - self.assertEqual(factor.get_inverse(damping_funcs[2]), - factor.get_inverse(damping_funcs[3])) + inv = factor.get_inverse(damping_funcs[0]).to_dense() + self.assertEqual(inv, factor.get_inverse(damping_funcs[1]).to_dense()) + self.assertNotEqual(inv, factor.get_inverse(damping_funcs[2]).to_dense()) + self.assertEqual(factor.get_inverse(damping_funcs[2]).to_dense(), + factor.get_inverse(damping_funcs[3]).to_dense()) factor_vars = tf_ops.get_collection(tf_ops.GraphKeys.GLOBAL_VARIABLES, factor_var_scope) - self.assertEqual(set([inv, factor.get_inverse(damping_funcs[2])]), - set(factor_vars)) + factor_tensors = (tf_ops.convert_to_tensor(var) for var in factor_vars) + + self.assertEqual(set([inv, + factor.get_inverse(damping_funcs[2]).to_dense()]), + set(factor_tensors)) self.assertEqual(shape, inv.get_shape()) def testRegisterMatpower(self): with 
tf_ops.Graph().as_default(): random_seed.set_random_seed(200) shape = [3, 3] - factor = InverseProvidingFactorTestingDummy(shape) + factor = DenseSquareMatrixFactorTestingDummy(shape) factor_var_scope = 'dummy/a_b_c' # TODO(b/74201126): Change to using the same func for both once @@ -278,10 +297,13 @@ def testRegisterMatpower(self): factor_vars = tf_ops.get_collection(tf_ops.GraphKeys.GLOBAL_VARIABLES, factor_var_scope) - matpower1 = factor.get_matpower(-0.5, damping_func_1) - matpower2 = factor.get_matpower(2, damping_func_2) - self.assertEqual(set([matpower1, matpower2]), set(factor_vars)) + factor_tensors = (tf_ops.convert_to_tensor(var) for var in factor_vars) + + matpower1 = factor.get_matpower(-0.5, damping_func_1).to_dense() + matpower2 = factor.get_matpower(2, damping_func_2).to_dense() + + self.assertEqual(set([matpower1, matpower2]), set(factor_tensors)) self.assertEqual(shape, matpower1.get_shape()) self.assertEqual(shape, matpower2.get_shape()) @@ -297,7 +319,7 @@ def testMakeInverseUpdateOpsManyInversesEigenDecomp(self): with tf_ops.Graph().as_default(), self.test_session() as sess: random_seed.set_random_seed(200) cov = np.array([[1., 2.], [3., 4.]]) - factor = InverseProvidingFactorTestingDummy(cov.shape) + factor = DenseSquareMatrixFactorTestingDummy(cov.shape) factor._cov = array_ops.constant(cov, dtype=dtypes.float32) damping_funcs = [] @@ -316,7 +338,8 @@ def testMakeInverseUpdateOpsManyInversesEigenDecomp(self): sess.run(ops) for i in range(ff.EIGENVALUE_DECOMPOSITION_THRESHOLD): # The inverse op will assign the damped inverse of cov to the inv var. - new_invs.append(sess.run(factor.get_inverse(damping_funcs[i]))) + new_invs.append( + sess.run(factor.get_inverse(damping_funcs[i]).to_dense())) # We want to see that the new invs are all different from each other. for i in range(len(new_invs)): @@ -328,7 +351,7 @@ def testMakeInverseUpdateOpsMatPowerEigenDecomp(self): with tf_ops.Graph().as_default(), self.test_session() as sess: random_seed.set_random_seed(200) cov = np.array([[6., 2.], [2., 4.]]) - factor = InverseProvidingFactorTestingDummy(cov.shape) + factor = DenseSquareMatrixFactorTestingDummy(cov.shape) factor._cov = array_ops.constant(cov, dtype=dtypes.float32) exp = 2 # NOTE(mattjj): must be int to test with np.linalg.matrix_power damping = 0.5 @@ -341,7 +364,7 @@ def testMakeInverseUpdateOpsMatPowerEigenDecomp(self): sess.run(tf_variables.global_variables_initializer()) sess.run(ops[0]) - matpower = sess.run(factor.get_matpower(exp, damping_func)) + matpower = sess.run(factor.get_matpower(exp, damping_func).to_dense()) matpower_np = np.linalg.matrix_power(cov + np.eye(2) * damping, exp) self.assertAllClose(matpower, matpower_np) @@ -349,7 +372,7 @@ def testMakeInverseUpdateOpsNoEigenDecomp(self): with tf_ops.Graph().as_default(), self.test_session() as sess: random_seed.set_random_seed(200) cov = np.array([[5., 2.], [2., 4.]]) # NOTE(mattjj): must be symmetric - factor = InverseProvidingFactorTestingDummy(cov.shape) + factor = DenseSquareMatrixFactorTestingDummy(cov.shape) factor._cov = array_ops.constant(cov, dtype=dtypes.float32) damping_func = make_damping_func(0) @@ -361,12 +384,12 @@ def testMakeInverseUpdateOpsNoEigenDecomp(self): sess.run(tf_variables.global_variables_initializer()) # The inverse op will assign the damped inverse of cov to the inv var. 
- old_inv = sess.run(factor.get_inverse(damping_func)) + old_inv = sess.run(factor.get_inverse(damping_func).to_dense()) self.assertAllClose( sess.run(ff.inverse_initializer(cov.shape, dtypes.float32)), old_inv) sess.run(ops) - new_inv = sess.run(factor.get_inverse(damping_func)) + new_inv = sess.run(factor.get_inverse(damping_func).to_dense()) self.assertAllClose(new_inv, np.linalg.inv(cov)) @@ -411,7 +434,7 @@ def testNaiveDiagonalFactorInit(self): tensor = array_ops.ones((2, 3), name='a/b/c') factor = ff.NaiveDiagonalFactor((tensor,), 32) factor.instantiate_cov_variables() - self.assertEqual([6, 1], factor.get_cov_var().get_shape().as_list()) + self.assertEqual([6, 1], factor.get_cov().get_shape().as_list()) def testNaiveDiagonalFactorInitFloat64(self): with tf_ops.Graph().as_default(): @@ -420,7 +443,7 @@ def testNaiveDiagonalFactorInitFloat64(self): tensor = array_ops.ones((2, 3), dtype=dtype, name='a/b/c') factor = ff.NaiveDiagonalFactor((tensor,), 32) factor.instantiate_cov_variables() - cov = factor.get_cov_var() + cov = factor.get_cov() self.assertEqual(cov.dtype, dtype) self.assertEqual([6, 1], cov.get_shape().as_list()) @@ -444,7 +467,7 @@ def testInitialization(self): vocab_size = 5 factor = ff.EmbeddingInputKroneckerFactor((input_ids,), vocab_size) factor.instantiate_cov_variables() - cov = factor.get_cov_var() + cov = factor.get_cov() self.assertEqual(cov.shape.as_list(), [vocab_size]) def testCovarianceUpdateOp(self): @@ -502,7 +525,7 @@ def testInit(self): self.kernel_height * self.kernel_width * self.in_channels, self.out_channels ], - factor.get_cov_var().shape.as_list()) + factor.get_cov().shape.as_list()) def testMakeCovarianceUpdateOp(self): with tf_ops.Graph().as_default(): @@ -564,7 +587,7 @@ def testHasBias(self): self.kernel_height * self.kernel_width * self.in_channels + 1, self.out_channels ], - factor.get_cov_var().shape.as_list()) + factor.get_cov().shape.as_list()) # Ensure update op doesn't crash. cov_update_op = factor.make_covariance_update_op(0.0) @@ -654,13 +677,13 @@ def test3DConvolution(self): # Ensure shape of covariance matches input size of filter. input_size = in_channels * (width**3) self.assertEqual([input_size, input_size], - factor.get_cov_var().shape.as_list()) + factor.get_cov().shape.as_list()) # Ensure cov_update_op doesn't crash. with self.test_session() as sess: sess.run(tf_variables.global_variables_initializer()) sess.run(factor.make_covariance_update_op(0.0)) - cov = sess.run(factor.get_cov_var()) + cov = sess.run(factor.get_cov()) # Cov should be rank-8, as the filter will be applied at each corner of # the 4-D cube. @@ -685,13 +708,13 @@ def testPointwiseConv2d(self): # Ensure shape of covariance matches input size of filter. self.assertEqual([in_channels, in_channels], - factor.get_cov_var().shape.as_list()) + factor.get_cov().shape.as_list()) # Ensure cov_update_op doesn't crash. with self.test_session() as sess: sess.run(tf_variables.global_variables_initializer()) sess.run(factor.make_covariance_update_op(0.0)) - cov = sess.run(factor.get_cov_var()) + cov = sess.run(factor.get_cov()) # Cov should be rank-9, as the filter will be applied at each location. self.assertMatrixRank(9, cov) @@ -716,7 +739,7 @@ def testStrides(self): with self.test_session() as sess: sess.run(tf_variables.global_variables_initializer()) sess.run(factor.make_covariance_update_op(0.0)) - cov = sess.run(factor.get_cov_var()) + cov = sess.run(factor.get_cov()) # Cov should be the sum of 3 * 2 = 6 outer products. 
self.assertMatrixRank(6, cov) @@ -742,7 +765,7 @@ def testDilationRate(self): with self.test_session() as sess: sess.run(tf_variables.global_variables_initializer()) sess.run(factor.make_covariance_update_op(0.0)) - cov = sess.run(factor.get_cov_var()) + cov = sess.run(factor.get_cov()) # Cov should be rank = in_channels, as only the center of the filter # receives non-zero input for each input channel. diff --git a/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py b/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py index 9325aa1b7325fa..560a9b0b426ecc 100644 --- a/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py +++ b/tensorflow/contrib/kfac/python/kernel_tests/optimizer_test.py @@ -20,6 +20,7 @@ import numpy as np +from tensorflow.contrib.kfac.python.ops import fisher_factors as ff from tensorflow.contrib.kfac.python.ops import layer_collection as lc from tensorflow.contrib.kfac.python.ops import optimizer from tensorflow.python.framework import ops @@ -32,6 +33,13 @@ from tensorflow.python.platform import test +# We need to set these constants since the numerical values used in the tests +# were chosen when these used to be the defaults. +ff.set_global_constants(init_covariances_at_zero=False, + zero_debias=False, + init_inverses_at_zero=False) + + def dummy_layer_collection(): lcoll = lc.LayerCollection() dummy = array_ops.constant([1., 2.]) @@ -186,6 +194,11 @@ def testApplyGradients(self): layer_collection, momentum=0.5, momentum_type='regular') + (cov_update_thunks, + inv_update_thunks) = opt.make_vars_and_create_op_thunks() + cov_update_ops = tuple(thunk() for thunk in cov_update_thunks) + inv_update_ops = tuple(thunk() for thunk in inv_update_thunks) + grads_and_vars = opt.compute_gradients(output, [weights, bias]) all_vars = [grad_and_var[1] for grad_and_var in grads_and_vars] @@ -193,6 +206,8 @@ def testApplyGradients(self): sess.run(tf_variables.global_variables_initializer()) old_vars = sess.run(all_vars) + sess.run(cov_update_ops) + sess.run(inv_update_ops) sess.run(op) new_vars = sess.run(all_vars) diff --git a/tensorflow/contrib/kfac/python/ops/BUILD b/tensorflow/contrib/kfac/python/ops/BUILD index cb0917bb851cff..3c01eb65e7a687 100644 --- a/tensorflow/contrib/kfac/python/ops/BUILD +++ b/tensorflow/contrib/kfac/python/ops/BUILD @@ -35,6 +35,7 @@ py_library( srcs = ["fisher_factors.py"], srcs_version = "PY2AND3", deps = [ + ":linear_operator", ":utils", "//tensorflow/python:array_ops", "//tensorflow/python:control_flow_ops", @@ -63,6 +64,19 @@ py_library( ], ) +py_library( + name = "linear_operator", + srcs = ["linear_operator.py"], + srcs_version = "PY2AND3", + deps = [ + ":utils", + "//tensorflow/python:framework_ops", + "//tensorflow/python:math_ops", + "//tensorflow/python/ops/linalg", + "@six_archive//:six", + ], +) + py_library( name = "loss_functions", srcs = ["loss_functions.py"], diff --git a/tensorflow/contrib/kfac/python/ops/estimator.py b/tensorflow/contrib/kfac/python/ops/estimator.py index d11c9c82881074..854f885c26f2b4 100644 --- a/tensorflow/contrib/kfac/python/ops/estimator.py +++ b/tensorflow/contrib/kfac/python/ops/estimator.py @@ -57,8 +57,8 @@ def make_fisher_estimator(placement_strategy=None, **kwargs): if placement_strategy in [None, "round_robin"]: return FisherEstimatorRoundRobin(**kwargs) else: - raise ValueError("Unimplemented vars and ops placement strategy : %s", - placement_strategy) + raise ValueError("Unimplemented vars and ops " + "placement strategy : {}".format(placement_strategy)) # pylint: 
enable=abstract-class-instantiated @@ -81,7 +81,9 @@ def __init__(self, exps=(-1,), estimation_mode="gradients", colocate_gradients_with_ops=True, - name="FisherEstimator"): + name="FisherEstimator", + compute_cholesky=False, + compute_cholesky_inverse=False): """Create a FisherEstimator object. Args: @@ -124,6 +126,12 @@ def __init__(self, name: A string. A name given to this estimator, which is added to the variable scope when constructing variables and ops. (Default: "FisherEstimator") + compute_cholesky: Bool. Whether or not the FisherEstimator will be + able to multiply vectors by the Cholesky factor. + (Default: False) + compute_cholesky_inverse: Bool. Whether or not the FisherEstimator + will be able to multiply vectors by the Cholesky factor inverse. + (Default: False) Raises: ValueError: If no losses have been registered with layer_collection. """ @@ -142,6 +150,8 @@ def __init__(self, self._made_vars = False self._exps = exps + self._compute_cholesky = compute_cholesky + self._compute_cholesky_inverse = compute_cholesky_inverse self._name = name @@ -170,44 +180,6 @@ def factors(self): def name(self): return self._name - @abc.abstractmethod - def make_ops_and_vars(self, scope=None): - """Make ops and vars with a specific placement strategy. - - For each factor, all of that factor's cov variables and their associated - update ops will be placed on a particular device. For example in case of - round robin placement a new device is chosen for each factor by cycling - through list of devices in the cov_devices argument. If cov_devices is None - then no explicit device placement occurs. - - An analogous strategy is followed for inverse update ops, with the list of - devices being given by the inv_devices argument. - - Inverse variables on the other hand are not placed on any specific device - (they will just use the current the device placement context, whatever - that happens to be). The idea is that the inverse variable belong where - they will be accessed most often, which is the device that actually applies - the preconditioner to the gradient. The user will be responsible for setting - the device context for this. - - Args: - scope: A string or None. If None it will be set to the name of this - estimator (given by the name property). All variables will be created, - and all ops will execute, inside of a variable scope of the given - name. (Default: None) - - Returns: - cov_update_ops: List of ops that compute the cov updates. Corresponds - one-to-one with the list of factors given by the "factors" property. - cov_update_op: cov_update_ops grouped into a single op. - inv_update_ops: List of ops that compute the inv updates. Corresponds - one-to-one with the list of factors given by the "factors" property. - inv_update_op: inv_update_ops grouped into a single op. - cov_update_thunks: Thunks that make the ops in cov_update_ops. - inv_update_thunks: Thunks that make the ops in inv_update_ops. - """ - pass - @abc.abstractmethod def make_vars_and_create_op_thunks(self, scope=None): """Make vars and create op thunks with a specific placement strategy. @@ -300,9 +272,54 @@ def multiply_matpower(self, exp, vecs_and_vars): A list of (transformed vector, var) pairs in the same order as vecs_and_vars. """ + assert exp in self._exps + fcn = lambda fb, vec: fb.multiply_matpower(vec, exp) return self._apply_transformation(vecs_and_vars, fcn) + def multiply_cholesky(self, vecs_and_vars, transpose=False): + """Multiplies the vecs by the corresponding Cholesky factors. 
+ + Args: + vecs_and_vars: List of (vector, variable) pairs. + transpose: Bool. If true the Cholesky factors are transposed before + multiplying the vecs. (Default: False) + + Returns: + A list of (transformed vector, var) pairs in the same order as + vecs_and_vars. + """ + assert self._compute_cholesky + + fcn = lambda fb, vec: fb.multiply_cholesky(vec, transpose=transpose) + return self._apply_transformation(vecs_and_vars, fcn) + + def multiply_cholesky_inverse(self, vecs_and_vars, transpose=False): + """Mults the vecs by the inverses of the corresponding Cholesky factors. + + Note: if you are using Cholesky inverse multiplication to sample from + a matrix-variate Gaussian you will want to multiply by the transpose. + Let L be the Cholesky factor of F and observe that + + L^-T * L^-1 = (L * L^T)^-1 = F^-1 . + + Thus we want to multiply by L^-T in order to sample from Gaussian with + covariance F^-1. + + Args: + vecs_and_vars: List of (vector, variable) pairs. + transpose: Bool. If true the Cholesky factor inverses are transposed + before multiplying the vecs. (Default: False) + + Returns: + A list of (transformed vector, var) pairs in the same order as + vecs_and_vars. + """ + assert self._compute_cholesky_inverse + + fcn = lambda fb, vec: fb.multiply_cholesky_inverse(vec, transpose=transpose) + return self._apply_transformation(vecs_and_vars, fcn) + def _instantiate_factors(self): """Instantiates FisherFactors' variables. @@ -333,9 +350,13 @@ def made_vars(self): return self._made_vars def _register_matrix_functions(self): - for exp in self._exps: - for block in self.blocks: + for block in self.blocks: + for exp in self._exps: block.register_matpower(exp) + if self._compute_cholesky: + block.register_cholesky() + if self._compute_cholesky_inverse: + block.register_cholesky_inverse() def _finalize_layer_collection(self): self._layers.create_subgraph() diff --git a/tensorflow/contrib/kfac/python/ops/estimator_lib.py b/tensorflow/contrib/kfac/python/ops/estimator_lib.py index 33c969650615bf..9c9fef471f8033 100644 --- a/tensorflow/contrib/kfac/python/ops/estimator_lib.py +++ b/tensorflow/contrib/kfac/python/ops/estimator_lib.py @@ -25,6 +25,7 @@ _allowed_symbols = [ 'FisherEstimator', + 'make_fisher_estimator', ] remove_undocumented(__name__, allowed_exception_list=_allowed_symbols) diff --git a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py index 00b3673a742e92..3a5c8eb5f9630f 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_blocks.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_blocks.py @@ -83,34 +83,22 @@ def normalize_damping(damping, num_replications): def compute_pi_tracenorm(left_cov, right_cov): - """Computes the scalar constant pi for Tikhonov regularization/damping. + r"""Computes the scalar constant pi for Tikhonov regularization/damping. $$\pi = \sqrt{ (trace(A) / dim(A)) / (trace(B) / dim(B)) }$$ See section 6.3 of https://arxiv.org/pdf/1503.05671.pdf for details. Args: - left_cov: The left Kronecker factor "covariance". - right_cov: The right Kronecker factor "covariance". + left_cov: A LinearOperator object. The left Kronecker factor "covariance". + right_cov: A LinearOperator object. The right Kronecker factor "covariance". Returns: The computed scalar constant pi for these Kronecker Factors (as a Tensor). """ - - def _trace(cov): - if len(cov.shape) == 1: - # Diagonal matrix. - return math_ops.reduce_sum(cov) - elif len(cov.shape) == 2: - # Full matrix. 
- return math_ops.trace(cov) - else: - raise ValueError( - "What's the trace of a Tensor of rank %d?" % len(cov.shape)) - # Instead of dividing by the dim of the norm, we multiply by the dim of the # other norm. This works out the same in the ratio. - left_norm = _trace(left_cov) * right_cov.shape.as_list()[0] - right_norm = _trace(right_cov) * left_cov.shape.as_list()[0] + left_norm = left_cov.trace() * int(right_cov.domain_dimension) + right_norm = right_cov.trace() * int(left_cov.domain_dimension) return math_ops.sqrt(left_norm / right_norm) @@ -188,6 +176,16 @@ def register_matpower(self, exp): """ pass + @abc.abstractmethod + def register_cholesky(self): + """Registers a Cholesky factor to be computed by the block.""" + pass + + @abc.abstractmethod + def register_cholesky_inverse(self): + """Registers an inverse Cholesky factor to be computed by the block.""" + pass + def register_inverse(self): """Registers a matrix inverse to be computed by the block.""" self.register_matpower(-1) @@ -228,6 +226,33 @@ def multiply(self, vector): """ return self.multiply_matpower(vector, 1) + @abc.abstractmethod + def multiply_cholesky(self, vector, transpose=False): + """Multiplies the vector by the (damped) Cholesky-factor of the block. + + Args: + vector: The vector (a Tensor or tuple of Tensors) to be multiplied. + transpose: Bool. If true the Cholesky factor is transposed before + multiplying the vector. (Default: False) + + Returns: + The vector left-multiplied by the (damped) Cholesky-factor of the block. + """ + pass + + @abc.abstractmethod + def multiply_cholesky_inverse(self, vector, transpose=False): + """Multiplies vector by the (damped) inverse Cholesky-factor of the block. + + Args: + vector: The vector (a Tensor or tuple of Tensors) to be multiplied. + transpose: Bool. If true the Cholesky factor inverse is transposed + before multiplying the vector. (Default: False) + Returns: + Vector left-multiplied by (damped) inverse Cholesky-factor of the block. + """ + pass + @abc.abstractmethod def tensors_to_compute_grads(self): """Returns the Tensor(s) with respect to which this FisherBlock needs grads. 
@@ -275,15 +300,32 @@ def instantiate_factors(self, grads_list, damping): def register_matpower(self, exp): self._factor.register_matpower(exp, self._damping_func) - def multiply_matpower(self, vector, exp): + def register_cholesky(self): + self._factor.register_cholesky(self._damping_func) + + def register_cholesky_inverse(self): + self._factor.register_cholesky_inverse(self._damping_func) + + def _multiply_matrix(self, matrix, vector, transpose=False): vector_flat = utils.tensors_to_column(vector) - out_flat = self._factor.left_multiply_matpower( - vector_flat, exp, self._damping_func) + out_flat = matrix.matmul(vector_flat, adjoint=transpose) return utils.column_to_tensors(vector, out_flat) + def multiply_matpower(self, vector, exp): + matrix = self._factor.get_matpower(exp, self._damping_func) + return self._multiply_matrix(matrix, vector) + + def multiply_cholesky(self, vector, transpose=False): + matrix = self._factor.get_cholesky(self._damping_func) + return self._multiply_matrix(matrix, vector, transpose=transpose) + + def multiply_cholesky_inverse(self, vector, transpose=False): + matrix = self._factor.get_cholesky_inverse(self._damping_func) + return self._multiply_matrix(matrix, vector, transpose=transpose) + def full_fisher_block(self): """Explicitly constructs the full Fisher block.""" - return self._factor.get_cov() + return self._factor.get_cov_as_linear_operator().to_dense() def tensors_to_compute_grads(self): return self._params @@ -305,7 +347,47 @@ def _batch_size(self): return math_ops.reduce_sum(self._batch_sizes) -class NaiveDiagonalFB(FisherBlock): +@six.add_metaclass(abc.ABCMeta) +class DiagonalFB(FisherBlock): + """A base class for FisherBlocks that use diagonal approximations.""" + + def register_matpower(self, exp): + # Not needed for this. Matrix powers are computed on demand in the + # diagonal case + pass + + def register_cholesky(self): + # Not needed for this. Cholesky's are computed on demand in the + # diagonal case + pass + + def register_cholesky_inverse(self): + # Not needed for this. Cholesky inverses's are computed on demand in the + # diagonal case + pass + + def _multiply_matrix(self, matrix, vector): + vector_flat = utils.tensors_to_column(vector) + out_flat = matrix.matmul(vector_flat) + return utils.column_to_tensors(vector, out_flat) + + def multiply_matpower(self, vector, exp): + matrix = self._factor.get_matpower(exp, self._damping_func) + return self._multiply_matrix(matrix, vector) + + def multiply_cholesky(self, vector, transpose=False): + matrix = self._factor.get_cholesky(self._damping_func) + return self._multiply_matrix(matrix, vector) + + def multiply_cholesky_inverse(self, vector, transpose=False): + matrix = self._factor.get_cholesky_inverse(self._damping_func) + return self._multiply_matrix(matrix, vector) + + def full_fisher_block(self): + return self._factor.get_cov_as_linear_operator().to_dense() + + +class NaiveDiagonalFB(DiagonalFB): """FisherBlock using a diagonal matrix approximation. This type of approximation is generically applicable but quite primitive. @@ -333,20 +415,6 @@ def instantiate_factors(self, grads_list, damping): self._factor = self._layer_collection.make_or_get_factor( fisher_factors.NaiveDiagonalFactor, (grads_list, self._batch_size)) - def register_matpower(self, exp): - # Not needed for this. 
Matrix powers are computed on demand in the - # diagonal case - pass - - def multiply_matpower(self, vector, exp): - vector_flat = utils.tensors_to_column(vector) - out_flat = self._factor.left_multiply_matpower( - vector_flat, exp, self._damping_func) - return utils.column_to_tensors(vector, out_flat) - - def full_fisher_block(self): - return self._factor.get_cov() - def tensors_to_compute_grads(self): return self._params @@ -452,7 +520,7 @@ def _outputs(self): return self.__outputs -class FullyConnectedDiagonalFB(InputOutputMultiTower, FisherBlock): +class FullyConnectedDiagonalFB(InputOutputMultiTower, DiagonalFB): """FisherBlock for fully-connected (dense) layers using a diagonal approx. Estimates the Fisher Information matrix's diagonal entries for a fully @@ -497,32 +565,8 @@ def instantiate_factors(self, grads_list, damping): self._damping_func = _package_func(lambda: damping, (damping,)) - def register_matpower(self, exp): - # Not needed for this. Matrix powers are computed on demand in the - # diagonal case - pass - def multiply_matpower(self, vector, exp): - """Multiplies the vector by the (damped) matrix-power of the block. - - Args: - vector: Tensor or 2-tuple of Tensors. if self._has_bias, Tensor of shape - [input_size, output_size] corresponding to layer's weights. If not, a - 2-tuple of the former and a Tensor of shape [output_size] corresponding - to the layer's bias. - exp: A scalar representing the power to raise the block before multiplying - it by the vector. - - Returns: - The vector left-multiplied by the (damped) matrix-power of the block. - """ - reshaped_vec = utils.layer_params_to_mat2d(vector) - reshaped_out = self._factor.left_multiply_matpower( - reshaped_vec, exp, self._damping_func) - return utils.mat2d_to_layer_params(vector, reshaped_out) - - -class ConvDiagonalFB(InputOutputMultiTower, FisherBlock): +class ConvDiagonalFB(InputOutputMultiTower, DiagonalFB): """FisherBlock for 2-D convolutional layers using a diagonal approx. Estimates the Fisher Information matrix's diagonal entries for a convolutional @@ -621,17 +665,6 @@ def damping_func(): self._num_locations) self._damping_func = _package_func(damping_func, damping_id) - def register_matpower(self, exp): - # Not needed for this. Matrix powers are computed on demand in the - # diagonal case - pass - - def multiply_matpower(self, vector, exp): - reshaped_vect = utils.layer_params_to_mat2d(vector) - reshaped_out = self._factor.left_multiply_matpower( - reshaped_vect, exp, self._damping_func) - return utils.mat2d_to_layer_params(vector, reshaped_out) - class KroneckerProductFB(FisherBlock): """A base class for blocks with separate input and output Kronecker factors. @@ -640,9 +673,6 @@ class KroneckerProductFB(FisherBlock): output factors. 
""" - def __init__(self, layer_collection): - super(KroneckerProductFB, self).__init__(layer_collection) - def _setup_damping(self, damping, normalization=None): """Makes functions that compute the damping values for both factors.""" def compute_damping(): @@ -651,9 +681,10 @@ def compute_damping(): else: maybe_normalized_damping = damping - return compute_pi_adjusted_damping(self._input_factor.get_cov(), - self._output_factor.get_cov(), - maybe_normalized_damping**0.5) + return compute_pi_adjusted_damping( + self._input_factor.get_cov_as_linear_operator(), + self._output_factor.get_cov_as_linear_operator(), + maybe_normalized_damping**0.5) if normalization is not None: damping_id = ("compute_pi_adjusted_damping", @@ -675,6 +706,14 @@ def register_matpower(self, exp): self._input_factor.register_matpower(exp, self._input_damping_func) self._output_factor.register_matpower(exp, self._output_damping_func) + def register_cholesky(self): + self._input_factor.register_cholesky(self._input_damping_func) + self._output_factor.register_cholesky(self._output_damping_func) + + def register_cholesky_inverse(self): + self._input_factor.register_cholesky_inverse(self._input_damping_func) + self._output_factor.register_cholesky_inverse(self._output_damping_func) + @property def _renorm_coeff(self): """Kronecker factor multiplier coefficient. @@ -687,17 +726,47 @@ def _renorm_coeff(self): """ return 1.0 - def multiply_matpower(self, vector, exp): + def _multiply_factored_matrix(self, left_factor, right_factor, vector, + extra_scale=1.0, transpose_left=False, + transpose_right=False): reshaped_vector = utils.layer_params_to_mat2d(vector) - reshaped_out = self._output_factor.right_multiply_matpower( - reshaped_vector, exp, self._output_damping_func) - reshaped_out = self._input_factor.left_multiply_matpower( - reshaped_out, exp, self._input_damping_func) - if self._renorm_coeff != 1.0: - renorm_coeff = math_ops.cast(self._renorm_coeff, dtype=reshaped_out.dtype) - reshaped_out *= math_ops.cast(renorm_coeff**exp, dtype=reshaped_out.dtype) + reshaped_out = right_factor.matmul_right(reshaped_vector, + adjoint=transpose_right) + reshaped_out = left_factor.matmul(reshaped_out, + adjoint=transpose_left) + if extra_scale != 1.0: + reshaped_out *= math_ops.cast(extra_scale, dtype=reshaped_out.dtype) return utils.mat2d_to_layer_params(vector, reshaped_out) + def multiply_matpower(self, vector, exp): + left_factor = self._input_factor.get_matpower( + exp, self._input_damping_func) + right_factor = self._output_factor.get_matpower( + exp, self._output_damping_func) + extra_scale = float(self._renorm_coeff)**exp + return self._multiply_factored_matrix(left_factor, right_factor, vector, + extra_scale=extra_scale) + + def multiply_cholesky(self, vector, transpose=False): + left_factor = self._input_factor.get_cholesky(self._input_damping_func) + right_factor = self._output_factor.get_cholesky(self._output_damping_func) + extra_scale = float(self._renorm_coeff)**0.5 + return self._multiply_factored_matrix(left_factor, right_factor, vector, + extra_scale=extra_scale, + transpose_left=transpose, + transpose_right=not transpose) + + def multiply_cholesky_inverse(self, vector, transpose=False): + left_factor = self._input_factor.get_cholesky_inverse( + self._input_damping_func) + right_factor = self._output_factor.get_cholesky_inverse( + self._output_damping_func) + extra_scale = float(self._renorm_coeff)**-0.5 + return self._multiply_factored_matrix(left_factor, right_factor, vector, + extra_scale=extra_scale, + 
transpose_left=transpose, + transpose_right=not transpose) + def full_fisher_block(self): """Explicitly constructs the full Fisher block. @@ -706,8 +775,8 @@ def full_fisher_block(self): Returns: The full Fisher block. """ - left_factor = self._input_factor.get_cov() - right_factor = self._output_factor.get_cov() + left_factor = self._input_factor.get_cov_as_linear_operator().to_dense() + right_factor = self._output_factor.get_cov_as_linear_operator().to_dense() return self._renorm_coeff * utils.kronecker_product(left_factor, right_factor) @@ -796,7 +865,7 @@ def instantiate_factors(self, grads_list, damping): class ConvKFCBasicFB(InputOutputMultiTower, KroneckerProductFB): - """FisherBlock for convolutional layers using the basic KFC approx. + r"""FisherBlock for convolutional layers using the basic KFC approx. Estimates the Fisher Information matrix's blog for a convolutional layer. @@ -945,10 +1014,10 @@ def __init__(self, self._filter_shape = (filter_height, filter_width, in_channels, in_channels * channel_multiplier) - def multiply_matpower(self, vector, exp): + def _multiply_matrix(self, matrix, vector): conv2d_vector = depthwise_conv2d_filter_to_conv2d_filter(vector) - conv2d_result = super(DepthwiseConvDiagonalFB, self).multiply_matpower( - conv2d_vector, exp) + conv2d_result = super( + DepthwiseConvDiagonalFB, self)._multiply_matrix(matrix, conv2d_vector) return conv2d_filter_to_depthwise_conv2d_filter(conv2d_result) @@ -1016,10 +1085,14 @@ def __init__(self, self._filter_shape = (filter_height, filter_width, in_channels, in_channels * channel_multiplier) - def multiply_matpower(self, vector, exp): + def _multiply_factored_matrix(self, left_factor, right_factor, vector, + extra_scale=1.0, transpose_left=False, + transpose_right=False): conv2d_vector = depthwise_conv2d_filter_to_conv2d_filter(vector) - conv2d_result = super(DepthwiseConvKFCBasicFB, self).multiply_matpower( - conv2d_vector, exp) + conv2d_result = super( + DepthwiseConvKFCBasicFB, self)._multiply_factored_matrix( + left_factor, right_factor, conv2d_vector, extra_scale=extra_scale, + transpose_left=transpose_left, transpose_right=transpose_right) return conv2d_filter_to_depthwise_conv2d_filter(conv2d_result) @@ -1233,6 +1306,8 @@ def _process_data(self, grads_list): else: raise ValueError("Global config variable TOWER_STRATEGY must be one of " "'concat' or 'separate'.") + else: + inputs = tuple(inputs) # Now we perform the analogous processing for grads_list if isinstance(grads_list[0][0], (list, tuple)): @@ -1275,6 +1350,8 @@ def _process_data(self, grads_list): else: raise ValueError("Global config variable TOWER_STRATEGY must be one of " "'concat' or 'separate'.") + else: + grads_list = tuple(tuple(grads) for grads in grads_list) if self._num_uses is None: raise ValueError("You must supply a value for the num_uses argument if " @@ -1664,3 +1741,12 @@ def gamma(x): return utils.mat2d_to_layer_params(vector, Z) # pylint: enable=invalid-name + + def multiply_cholesky(self, vector): + raise NotImplementedError("FullyConnectedSeriesFB does not support " + "Cholesky computations.") + + def multiply_cholesky_inverse(self, vector): + raise NotImplementedError("FullyConnectedSeriesFB does not support " + "Cholesky computations.") + diff --git a/tensorflow/contrib/kfac/python/ops/fisher_factors.py b/tensorflow/contrib/kfac/python/ops/fisher_factors.py index 7988a3b92bf013..b43232dfafaa6d 100644 --- a/tensorflow/contrib/kfac/python/ops/fisher_factors.py +++ b/tensorflow/contrib/kfac/python/ops/fisher_factors.py @@ -24,6 +24,7 
@@ import numpy as np import six +from tensorflow.contrib.kfac.python.ops import linear_operator as lo from tensorflow.contrib.kfac.python.ops import utils from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops as tf_ops @@ -42,10 +43,14 @@ # Whether to initialize covariance estimators at a zero matrix (or the identity # matrix). -INIT_COVARIANCES_AT_ZERO = False +INIT_COVARIANCES_AT_ZERO = True # Whether to zero-debias the moving averages. -ZERO_DEBIAS = False +ZERO_DEBIAS = True + +# Whether to initialize inverse (and other such matrices computed from the cov +# matrices) to the zero matrix (or the identity matrix). +INIT_INVERSES_AT_ZERO = True # When the number of inverses requested from a FisherFactor exceeds this value, # the inverses are computed using an eigenvalue decomposition. @@ -82,6 +87,7 @@ def set_global_constants(init_covariances_at_zero=None, zero_debias=None, + init_inverses_at_zero=None, eigenvalue_decomposition_threshold=None, eigenvalue_clipping_threshold=None, max_num_outer_products_per_cov_row=None, @@ -92,6 +98,7 @@ def set_global_constants(init_covariances_at_zero=None, """Sets various global constants used by the classes in this module.""" global INIT_COVARIANCES_AT_ZERO global ZERO_DEBIAS + global INIT_INVERSES_AT_ZERO global EIGENVALUE_DECOMPOSITION_THRESHOLD global EIGENVALUE_CLIPPING_THRESHOLD global _MAX_NUM_OUTER_PRODUCTS_PER_COV_ROW @@ -104,6 +111,8 @@ def set_global_constants(init_covariances_at_zero=None, INIT_COVARIANCES_AT_ZERO = init_covariances_at_zero if zero_debias is not None: ZERO_DEBIAS = zero_debias + if init_inverses_at_zero is not None: + INIT_INVERSES_AT_ZERO = init_inverses_at_zero if eigenvalue_decomposition_threshold is not None: EIGENVALUE_DECOMPOSITION_THRESHOLD = eigenvalue_decomposition_threshold if eigenvalue_clipping_threshold is not None: @@ -121,19 +130,21 @@ def set_global_constants(init_covariances_at_zero=None, def inverse_initializer(shape, dtype, partition_info=None): # pylint: disable=unused-argument - return array_ops.diag(array_ops.ones(shape[0], dtype)) + if INIT_INVERSES_AT_ZERO: + return array_ops.zeros(shape, dtype=dtype) + return linalg_ops.eye(num_rows=shape[0], dtype=dtype) def covariance_initializer(shape, dtype, partition_info=None): # pylint: disable=unused-argument if INIT_COVARIANCES_AT_ZERO: - return array_ops.diag(array_ops.zeros(shape[0], dtype)) - return array_ops.diag(array_ops.ones(shape[0], dtype)) + return array_ops.zeros(shape, dtype=dtype) + return linalg_ops.eye(num_rows=shape[0], dtype=dtype) -def diagonal_covariance_initializer(shape, dtype, partition_info): # pylint: disable=unused-argument +def diagonal_covariance_initializer(shape, dtype, partition_info=None): # pylint: disable=unused-argument if INIT_COVARIANCES_AT_ZERO: - return array_ops.zeros(shape, dtype) - return array_ops.ones(shape, dtype) + return array_ops.zeros(shape, dtype=dtype) + return array_ops.ones(shape, dtype=dtype) @contextlib.contextmanager @@ -399,7 +410,7 @@ def _compute_new_cov(self, source, tower): the cov update. Returns: - Tensor of same shape as self.get_cov_var(). + Tensor of same shape as self.get_cov(). """ pass @@ -448,78 +459,43 @@ def make_inverse_update_ops(self): """Create and return update ops corresponding to registered computations.""" pass - @abc.abstractmethod def get_cov(self): - """Get full covariance matrix. - - Returns: - Tensor of shape [n, n]. Represents all parameter-parameter correlations - captured by this FisherFactor. 
- """ - pass - - def get_cov_var(self): - """Get variable backing this FisherFactor. - - May or may not be the same as self.get_cov() - - Returns: - Variable of shape self._cov_shape. - """ return self._cov @abc.abstractmethod - def left_multiply_matpower(self, x, exp, damping_func): - """Left multiplies 'x' by matrix power of this factor (w/ damping applied). - - This calculation is essentially: - (C + damping * I)**exp * x - where * is matrix-multiplication, ** is matrix power, I is the identity - matrix, and C is the matrix represented by this factor. - - x can represent either a matrix or a vector. For some factors, 'x' might - represent a vector but actually be stored as a 2D matrix for convenience. - - Args: - x: Tensor. Represents a single vector. Shape depends on implementation. - exp: float. The matrix exponent to use. - damping_func: A function that computes a 0-D Tensor or a float which will - be the damping value used. i.e. damping = damping_func(). + def get_cov_as_linear_operator(self): + pass - Returns: - Tensor of same shape as 'x' representing the result of the multiplication. - """ + @abc.abstractmethod + def register_matpower(self, exp, damping_func): pass @abc.abstractmethod - def right_multiply_matpower(self, x, exp, damping_func): - """Right multiplies 'x' by matrix power of this factor (w/ damping applied). + def register_cholesky(self, damping_func): + pass - This calculation is essentially: - x * (C + damping * I)**exp - where * is matrix-multiplication, ** is matrix power, I is the identity - matrix, and C is the matrix represented by this factor. + @abc.abstractmethod + def register_cholesky_inverse(self, damping_func): + pass - Unlike left_multiply_matpower, x will always be a matrix. + @abc.abstractmethod + def get_matpower(self, exp, damping_func): + pass - Args: - x: Tensor. Represents a single vector. Shape depends on implementation. - exp: float. The matrix exponent to use. - damping_func: A function that computes a 0-D Tensor or a float which will - be the damping value used. i.e. damping = damping_func(). + @abc.abstractmethod + def get_cholesky(self, damping_func): + pass - Returns: - Tensor of same shape as 'x' representing the result of the multiplication. - """ + @abc.abstractmethod + def get_cholesky_inverse(self, damping_func): pass -class InverseProvidingFactor(FisherFactor): - """Base class for FisherFactors that maintain inverses explicitly. +class DenseSquareMatrixFactor(FisherFactor): + """Base class for FisherFactors that are stored as dense square matrices. - This class explicitly calculates and stores inverses of covariance matrices - provided by the underlying FisherFactor implementation. It is assumed that - vectors can be represented as 2-D matrices. + This class explicitly calculates and stores inverses of their `cov` matrices, + which must be square dense matrices. Subclasses must implement the _compute_new_cov method, and the _var_scope and _cov_shape properties. 
@@ -538,7 +514,19 @@ def __init__(self): self._eigendecomp = None self._damping_funcs_by_id = {} # {hashable: lambda} - super(InverseProvidingFactor, self).__init__() + self._cholesky_registrations = set() # { hashable } + self._cholesky_inverse_registrations = set() # { hashable } + + self._cholesky_by_damping = {} # { hashable: variable } + self._cholesky_inverse_by_damping = {} # { hashable: variable } + + super(DenseSquareMatrixFactor, self).__init__() + + def get_cov_as_linear_operator(self): + assert self.get_cov().shape.ndims == 2 + return lo.LinearOperatorFullMatrix(self.get_cov(), + is_self_adjoint=True, + is_square=True) def _register_damping(self, damping_func): damping_id = graph_func_to_id(damping_func) @@ -563,8 +551,6 @@ def register_matpower(self, exp, damping_func): be the damping value used. i.e. damping = damping_func(). """ if exp == 1.0: - # We don't register these. The user shouldn't even be calling this - # function with exp = 1.0. return damping_id = self._register_damping(damping_func) @@ -572,6 +558,38 @@ def register_matpower(self, exp, damping_func): if (exp, damping_id) not in self._matpower_registrations: self._matpower_registrations.add((exp, damping_id)) + def register_cholesky(self, damping_func): + """Registers a Cholesky factor to be maintained and served on demand. + + This creates a variable and signals make_inverse_update_ops to make the + corresponding update op. The variable can be read via the method + get_cholesky. + + Args: + damping_func: A function that computes a 0-D Tensor or a float which will + be the damping value used. i.e. damping = damping_func(). + """ + damping_id = self._register_damping(damping_func) + + if damping_id not in self._cholesky_registrations: + self._cholesky_registrations.add(damping_id) + + def register_cholesky_inverse(self, damping_func): + """Registers an inverse Cholesky factor to be maintained/served on demand. + + This creates a variable and signals make_inverse_update_ops to make the + corresponding update op. The variable can be read via the method + get_cholesky_inverse. + + Args: + damping_func: A function that computes a 0-D Tensor or a float which will + be the damping value used. i.e. damping = damping_func(). 
+ """ + damping_id = self._register_damping(damping_func) + + if damping_id not in self._cholesky_inverse_registrations: + self._cholesky_inverse_registrations.add(damping_id) + def instantiate_inv_variables(self): """Makes the internal "inverse" variable(s).""" @@ -589,6 +607,32 @@ def instantiate_inv_variables(self): assert (exp, damping_id) not in self._matpower_by_exp_and_damping self._matpower_by_exp_and_damping[(exp, damping_id)] = matpower + for damping_id in self._cholesky_registrations: + damping_func = self._damping_funcs_by_id[damping_id] + damping_string = graph_func_to_string(damping_func) + with variable_scope.variable_scope(self._var_scope): + chol = variable_scope.get_variable( + "cholesky_damp{}".format(damping_string), + initializer=inverse_initializer, + shape=self._cov_shape, + trainable=False, + dtype=self._dtype) + assert damping_id not in self._cholesky_by_damping + self._cholesky_by_damping[damping_id] = chol + + for damping_id in self._cholesky_inverse_registrations: + damping_func = self._damping_funcs_by_id[damping_id] + damping_string = graph_func_to_string(damping_func) + with variable_scope.variable_scope(self._var_scope): + cholinv = variable_scope.get_variable( + "cholesky_inverse_damp{}".format(damping_string), + initializer=inverse_initializer, + shape=self._cov_shape, + trainable=False, + dtype=self._dtype) + assert damping_id not in self._cholesky_inverse_by_damping + self._cholesky_inverse_by_damping[damping_id] = cholinv + def make_inverse_update_ops(self): """Create and return update ops corresponding to registered computations.""" ops = [] @@ -606,7 +650,8 @@ def make_inverse_update_ops(self): # We precompute these so we don't need to evaluate them multiple times (for # each matrix power that uses them) - damping_value_by_id = {damping_id: self._damping_funcs_by_id[damping_id]() + damping_value_by_id = {damping_id: math_ops.cast( + self._damping_funcs_by_id[damping_id](), self._dtype) for damping_id in self._damping_funcs_by_id} if use_eig: @@ -627,29 +672,91 @@ def make_inverse_update_ops(self): self._matpower_by_exp_and_damping.items()): assert exp == -1 damping = damping_value_by_id[damping_id] - ops.append(matpower.assign(utils.posdef_inv(self._cov, damping))) + ops.append(matpower.assign(utils.posdef_inv(self.get_cov(), damping))) + + # TODO(b/77902055): If inverses are being computed with Cholesky's + # we can share the work. Instead this code currently just computes the + # Cholesky a second time. It does at least share work between requests for + # Cholesky's and Cholesky inverses with the same damping id. 
+ for damping_id, cholesky_inv in self._cholesky_inverse_by_damping.items(): + cholesky_ops = [] + + damping = damping_value_by_id[damping_id] + cholesky_value = utils.cholesky(self.get_cov(), damping) + + if damping_id in self._cholesky_by_damping: + cholesky = self._cholesky_by_damping[damping_id] + cholesky_ops.append(cholesky.assign(cholesky_value)) + + identity = linalg_ops.eye(cholesky_value.shape.as_list()[0], + dtype=cholesky_value.dtype) + cholesky_inv_value = linalg_ops.matrix_triangular_solve(cholesky_value, + identity) + cholesky_ops.append(cholesky_inv.assign(cholesky_inv_value)) + + ops.append(control_flow_ops.group(*cholesky_ops)) + + for damping_id, cholesky in self._cholesky_by_damping.items(): + if damping_id not in self._cholesky_inverse_by_damping: + damping = damping_value_by_id[damping_id] + cholesky_value = utils.cholesky(self.get_cov(), damping) + ops.append(cholesky.assign(cholesky_value)) self._eigendecomp = False return ops def get_inverse(self, damping_func): # Just for backwards compatibility of some old code and tests - damping_id = graph_func_to_id(damping_func) - return self._matpower_by_exp_and_damping[(-1, damping_id)] + return self.get_matpower(-1, damping_func) def get_matpower(self, exp, damping_func): + # Note that this function returns a variable which gets updated by the + # inverse ops. It may be stale / inconsistent with the latest value of + # get_cov(). + if exp != 1: + damping_id = graph_func_to_id(damping_func) + matpower = self._matpower_by_exp_and_damping[(exp, damping_id)] + else: + matpower = self.get_cov() + identity = linalg_ops.eye(matpower.shape.as_list()[0], + dtype=matpower.dtype) + matpower += math_ops.cast(damping_func(), dtype=matpower.dtype)*identity + + assert matpower.shape.ndims == 2 + return lo.LinearOperatorFullMatrix(matpower, + is_non_singular=True, + is_self_adjoint=True, + is_positive_definite=True, + is_square=True) + + def get_cholesky(self, damping_func): # Note that this function returns a variable which gets updated by the # inverse ops. It may be stale / inconsistent with the latest value of # get_cov(). damping_id = graph_func_to_id(damping_func) - return self._matpower_by_exp_and_damping[(exp, damping_id)] + cholesky = self._cholesky_by_damping[damping_id] + assert cholesky.shape.ndims == 2 + return lo.LinearOperatorFullMatrix(cholesky, + is_non_singular=True, + is_square=True) + + def get_cholesky_inverse(self, damping_func): + # Note that this function returns a variable which gets updated by the + # inverse ops. It may be stale / inconsistent with the latest value of + # get_cov(). + damping_id = graph_func_to_id(damping_func) + cholesky_inv = self._cholesky_inverse_by_damping[damping_id] + assert cholesky_inv.shape.ndims == 2 + return lo.LinearOperatorFullMatrix(cholesky_inv, + is_non_singular=True, + is_square=True) def get_eigendecomp(self): """Creates or retrieves eigendecomposition of self._cov.""" # Unlike get_matpower this doesn't retrieve a stored variable, but instead # always computes a fresh version from the current value of get_cov(). if not self._eigendecomp: - eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self._cov) + eigenvalues, eigenvectors = linalg_ops.self_adjoint_eig(self.get_cov()) # The matrix self._cov is positive semidefinite by construction, but the # numerical eigenvalues could be negative due to numerical errors, so here @@ -660,45 +767,8 @@ def get_eigendecomp(self): return self._eigendecomp - def get_cov(self): - # Variable contains full covariance matrix. 
- return self.get_cov_var() - - def left_multiply_matpower(self, x, exp, damping_func): - if isinstance(x, tf_ops.IndexedSlices): - raise ValueError("Left-multiply not yet supported for IndexedSlices.") - - if x.shape.ndims != 2: - raise ValueError( - "InverseProvidingFactors apply to matrix-shaped vectors. Found: %s." - % (x,)) - - if exp == 1: - return math_ops.matmul(self.get_cov(), x) + damping_func() * x - - return math_ops.matmul(self.get_matpower(exp, damping_func), x) - - def right_multiply_matpower(self, x, exp, damping_func): - if isinstance(x, tf_ops.IndexedSlices): - if exp == 1: - n = self.get_cov().shape[0] - damped_cov = self.get_cov() + damping_func() * array_ops.eye(n) - return utils.matmul_sparse_dense(x, damped_cov) - - return utils.matmul_sparse_dense(x, self.get_matpower(exp, damping_func)) - - if x.shape.ndims != 2: - raise ValueError( - "InverseProvidingFactors apply to matrix-shaped vectors. Found: %s." - % (x,)) - if exp == 1: - return math_ops.matmul(x, self.get_cov()) + damping_func() * x - - return math_ops.matmul(x, self.get_matpower(exp, damping_func)) - - -class FullFactor(InverseProvidingFactor): +class FullFactor(DenseSquareMatrixFactor): """FisherFactor for a full matrix representation of the Fisher of a parameter. Note that this uses the naive "square the sum estimator", and so is applicable @@ -757,41 +827,51 @@ class DiagonalFactor(FisherFactor): """ def __init__(self): - self._damping_funcs_by_id = {} # { hashable: lambda } super(DiagonalFactor, self).__init__() + def get_cov_as_linear_operator(self): + assert self._matrix_diagonal.shape.ndims == 1 + return lo.LinearOperatorDiag(self._matrix_diagonal, + is_self_adjoint=True, + is_square=True) + @property def _cov_initializer(self): return diagonal_covariance_initializer + @property + def _matrix_diagonal(self): + return array_ops.reshape(self.get_cov(), [-1]) + def make_inverse_update_ops(self): return [] def instantiate_inv_variables(self): pass - def get_cov(self): - # self.get_cov() could be any shape, but it must have one entry per - # parameter. Flatten it into a vector. - cov_diag_vec = array_ops.reshape(self.get_cov_var(), [-1]) - return array_ops.diag(cov_diag_vec) + def register_matpower(self, exp, damping_func): + pass - def left_multiply_matpower(self, x, exp, damping_func): - matpower = (self.get_cov_var() + damping_func())**exp + def register_cholesky(self, damping_func): + pass - if isinstance(x, tf_ops.IndexedSlices): - return utils.matmul_diag_sparse(array_ops.reshape(matpower, [-1]), x) + def register_cholesky_inverse(self, damping_func): + pass - if x.shape != matpower.shape: - raise ValueError("x (%s) and cov (%s) must have same shape." 
% - (x, matpower)) - return matpower * x + def get_matpower(self, exp, damping_func): + matpower_diagonal = (self._matrix_diagonal + + math_ops.cast(damping_func(), self._dtype))**exp + return lo.LinearOperatorDiag(matpower_diagonal, + is_non_singular=True, + is_self_adjoint=True, + is_positive_definite=True, + is_square=True) - def right_multiply_matpower(self, x, exp, damping_func): - raise NotImplementedError("Only left-multiply is currently supported.") + def get_cholesky(self, damping_func): + return self.get_matpower(0.5, damping_func) - def register_matpower(self, exp, damping_func): - pass + def get_cholesky_inverse(self, damping_func): + return self.get_matpower(-0.5, damping_func) class NaiveDiagonalFactor(DiagonalFactor): @@ -1167,7 +1247,7 @@ def _get_data_device(self, tower): return self._inputs[tower].device -class FullyConnectedKroneckerFactor(InverseProvidingFactor): +class FullyConnectedKroneckerFactor(DenseSquareMatrixFactor): """Kronecker factor for the input or output side of a fully-connected layer. """ @@ -1220,7 +1300,7 @@ def _get_data_device(self, tower): return self._tensors[0][tower].device -class ConvInputKroneckerFactor(InverseProvidingFactor): +class ConvInputKroneckerFactor(DenseSquareMatrixFactor): r"""Kronecker factor for the input side of a convolutional layer. Estimates E[ a a^T ] where a is the inputs to a convolutional layer given @@ -1384,7 +1464,7 @@ def _get_data_device(self, tower): return self._inputs[tower].device -class ConvOutputKroneckerFactor(InverseProvidingFactor): +class ConvOutputKroneckerFactor(DenseSquareMatrixFactor): r"""Kronecker factor for the output side of a convolutional layer. Estimates E[ ds ds^T ] where s is the preactivations of a convolutional layer @@ -1674,6 +1754,7 @@ def make_inverse_update_ops(self): psi_var) in self._option1quants_by_damping.items(): damping = self._damping_funcs_by_id[damping_id]() + damping = math_ops.cast(damping, self._dtype) invsqrtC0 = math_ops.matmul( eigen_V * (eigen_e + damping)**(-0.5), eigen_V, transpose_b=True) @@ -1702,6 +1783,7 @@ def make_inverse_update_ops(self): mu_var) in self._option2quants_by_damping.items(): damping = self._damping_funcs_by_id[damping_id]() + damping = math_ops.cast(damping, self._dtype) # compute C0^(-1/2) invsqrtC0 = math_ops.matmul( diff --git a/tensorflow/contrib/kfac/python/ops/layer_collection.py b/tensorflow/contrib/kfac/python/ops/layer_collection.py index 366e2a82d56602..cbbfe7212c9d94 100644 --- a/tensorflow/contrib/kfac/python/ops/layer_collection.py +++ b/tensorflow/contrib/kfac/python/ops/layer_collection.py @@ -182,7 +182,7 @@ def __init__(self, self._graph = graph or ops.get_default_graph() self._loss_dict = {} # {str: LossFunction} self._subgraph = None - self._default_generic_approximation = APPROX_FULL_NAME + self._default_generic_approximation = APPROX_DIAGONAL_NAME self._default_embedding_approximation = APPROX_KRONECKER_NAME self._default_fully_connected_approximation = APPROX_KRONECKER_NAME self._default_conv2d_approximation = APPROX_KRONECKER_NAME diff --git a/tensorflow/contrib/kfac/python/ops/linear_operator.py b/tensorflow/contrib/kfac/python/ops/linear_operator.py new file mode 100644 index 00000000000000..61cb955ae85df9 --- /dev/null +++ b/tensorflow/contrib/kfac/python/ops/linear_operator.py @@ -0,0 +1,95 @@ +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""SmartMatrices definitions.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensorflow.contrib.kfac.python.ops import utils +from tensorflow.python.framework import ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops.linalg import linalg +from tensorflow.python.ops.linalg import linalg_impl +from tensorflow.python.ops.linalg import linear_operator_util as lou + + +class LinearOperatorExtras(object): # pylint: disable=missing-docstring + + def matmul(self, x, adjoint=False, adjoint_arg=False, name="matmul"): + + with self._name_scope(name, values=[x]): + if isinstance(x, ops.IndexedSlices): + return self._matmul_sparse(x, adjoint=adjoint, adjoint_arg=adjoint_arg) + + x = ops.convert_to_tensor(x, name="x") + self._check_input_dtype(x) + + self_dim = -2 if adjoint else -1 + arg_dim = -1 if adjoint_arg else -2 + self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim]) + + return self._matmul(x, adjoint=adjoint, adjoint_arg=adjoint_arg) + + def matmul_right(self, x, adjoint=False, adjoint_arg=False, name="matmul"): + + with self._name_scope(name, values=[x]): + + if isinstance(x, ops.IndexedSlices): + return self._matmul_right_sparse( + x, adjoint=adjoint, adjoint_arg=adjoint_arg) + + x = ops.convert_to_tensor(x, name="x") + self._check_input_dtype(x) + + self_dim = -1 if adjoint else -2 + arg_dim = -2 if adjoint_arg else -1 + self.shape[self_dim].assert_is_compatible_with(x.get_shape()[arg_dim]) + + return self._matmul_right(x, adjoint=adjoint, adjoint_arg=adjoint_arg) + + +class LinearOperatorFullMatrix(LinearOperatorExtras, + linalg.LinearOperatorFullMatrix): + + # TODO(b/78117889) Remove this definition once core LinearOperator + # has _matmul_right. 
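+  # Right-multiplication, i.e. x * A: implemented by swapping the operand
+  # order (and the corresponding adjoint flags) in matmul_with_broadcast.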
+ def _matmul_right(self, x, adjoint=False, adjoint_arg=False): + return lou.matmul_with_broadcast( + x, self._matrix, adjoint_a=adjoint_arg, adjoint_b=adjoint) + + def _matmul_sparse(self, x, adjoint=False, adjoint_arg=False): + raise NotImplementedError + + def _matmul_right_sparse(self, x, adjoint=False, adjoint_arg=False): + assert not adjoint and not adjoint_arg + return utils.matmul_sparse_dense(x, self._matrix) + + +class LinearOperatorDiag(LinearOperatorExtras, # pylint: disable=missing-docstring + linalg.LinearOperatorDiag): + + def _matmul_right(self, x, adjoint=False, adjoint_arg=False): + diag_mat = math_ops.conj(self._diag) if adjoint else self._diag + x = linalg_impl.adjoint(x) if adjoint_arg else x + return diag_mat * x + + def _matmul_sparse(self, x, adjoint=False, adjoint_arg=False): + diag_mat = math_ops.conj(self._diag) if adjoint else self._diag + assert not adjoint_arg + return utils.matmul_diag_sparse(diag_mat, x) + + def _matmul_right_sparse(self, x, adjoint=False, adjoint_arg=False): + raise NotImplementedError diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py index f01c5a832212f8..03b9da793307b9 100644 --- a/tensorflow/contrib/kfac/python/ops/optimizer.py +++ b/tensorflow/contrib/kfac/python/ops/optimizer.py @@ -19,6 +19,7 @@ from __future__ import print_function import warnings + # pylint disable=long-line from tensorflow.contrib.kfac.python.ops import curvature_matrix_vector_products as cmvp from tensorflow.contrib.kfac.python.ops import estimator as est @@ -67,7 +68,7 @@ def __init__(self, the local approximation with the Fisher information matrix, and to regularize the update direction by making it closer to the gradient. If damping is adapted during training then this value is used for - initializing damping varaible. + initializing damping variable. (Higher damping means the update looks more like a standard gradient update - see Tikhonov regularization.) layer_collection: The layer collection object, which holds the fisher @@ -108,6 +109,10 @@ def __init__(self, ValueError: If momentum is non-zero and momentum_type is not 'regular' or 'adam'. """ + warnings.warn( + "third_party.tensorflow.contrib.kfac is deprecated." + "This will be removed on 15-07-2018. Check README for further details.", + DeprecationWarning) # Parameters to be passed to the Fisher estimator: self._variables = var_list or tf_variables.trainable_variables self._cov_ema_decay = cov_ema_decay @@ -115,7 +120,7 @@ def __init__(self, self._estimation_mode = estimation_mode self._colocate_gradients_with_ops = colocate_gradients_with_ops - # The below paramaters are required only if damping needs to be adapated. + # The below parameters are required only if damping needs to be adapated. # These parameters can be set by calling # set_damping_adaptation_params() explicitly. self._damping_adaptation_decay = 0.95 @@ -196,7 +201,7 @@ def set_damping_adaptation_params(self, min_damping: `float`(Optional), Minimum value the damping parameter can take. Default value 1e-5. damping_adaptation_decay: `float`(Optional), The `damping` parameter is - multipled by the `damping_adaptation_decay` every + multiplied by the `damping_adaptation_decay` every `damping_adaptation_interval` number of iterations. Default value 0.99. damping_adaptation_interval: `int`(Optional), Number of steps in between updating the `damping` parameter. Default value 5. 
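
The hunk below removes the `cov_update_op` / `inv_update_op` convenience
properties along with `make_ops_and_vars`, so callers now build the update ops
themselves from the thunks returned by `make_vars_and_create_op_thunks`. A
minimal sketch of that usage; the toy model, registration calls, and
constructor arguments are illustrative rather than taken from this patch:

```python
import tensorflow as tf
from tensorflow.contrib.kfac.python.ops import layer_collection as lc
from tensorflow.contrib.kfac.python.ops import optimizer as opt_lib

# Toy single-layer softmax model (illustrative only).
x = tf.random_normal([8, 5])
labels = tf.constant([0, 1, 2, 0, 1, 2, 0, 1])
w = tf.get_variable("w", [5, 3])
b = tf.get_variable("b", [3])
logits = tf.matmul(x, w) + b
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                   logits=logits))

# Register the layer and the predictive distribution with K-FAC.
layer_collection = lc.LayerCollection()
layer_collection.register_fully_connected((w, b), x, logits)
layer_collection.register_categorical_predictive_distribution(logits)

optimizer = opt_lib.KfacOptimizer(
    learning_rate=0.01,
    cov_ema_decay=0.95,
    damping=1e-3,
    layer_collection=layer_collection)

# Build the covariance / inverse update ops from the returned thunks.
cov_thunks, inv_thunks = optimizer.make_vars_and_create_op_thunks()
cov_update_op = tf.group(*(thunk() for thunk in cov_thunks))
inv_update_op = tf.group(*(thunk() for thunk in inv_thunks))
train_op = optimizer.minimize(loss)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(10):
    sess.run([cov_update_op, inv_update_op, train_op])
```
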
@@ -243,62 +248,6 @@ def damping(self): def damping_adaptation_interval(self): return self._damping_adaptation_interval - @property - def cov_update_thunks(self): - self._maybe_make_and_save_everything() - return self._cov_update_thunks - - @property - def cov_update_ops(self): - self._maybe_make_and_save_everything() - return self._cov_update_ops - - @property - def cov_update_op(self): - self._maybe_make_and_save_everything() - return self._cov_update_op - - @property - def inv_update_thunks(self): - self._maybe_make_and_save_everything() - return self._inv_update_thunks - - @property - def inv_update_ops(self): - self._maybe_make_and_save_everything() - return self._inv_update_ops - - @property - def inv_update_op(self): - self._maybe_make_and_save_everything() - return self._inv_update_op - - def _maybe_make_and_save_everything(self): - if not self._fisher_est.made_vars(): - warnings.warn("These convenience properties will be depcrecated soon. " - "Please use explicit op/thunk creation methods instead " - "(e.g. make_ops_and_vars, etc).", - DeprecationWarning) - (self._cov_update_ops, self._cov_update_op, self._inv_update_ops, - self._inv_update_op, self._cov_update_thunks, - self._inv_update_thunks) = self.make_ops_and_vars() - - def make_ops_and_vars(self): - """Make ops and vars with device placement `self._placement_strategy`. - - See `FisherEstimator.make_ops_and_vars` for details. - - Returns: - cov_update_ops: List of ops that compute the cov updates. Corresponds - one-to-one with the list of factors given by the "factors" property. - cov_update_op: cov_update_ops grouped into a single op. - inv_update_ops: List of ops that compute the inv updates. Corresponds - one-to-one with the list of factors given by the "factors" property. - cov_update_op: cov_update_ops grouped into a single op. - inv_update_op: inv_update_ops grouped into a single op. - """ - return self._fisher_est.make_ops_and_vars(scope=self.get_name()) - def make_vars_and_create_op_thunks(self): """Make vars and create op thunks. @@ -385,7 +334,6 @@ def apply_gradients(self, grads_and_vars, *args, **kwargs): Returns: An `Operation` that applies the specified gradients. """ - self._maybe_make_and_save_everything() # In Python 3, grads_and_vars can be a zip() object which can only be # iterated over once. By converting it to a list, we ensure that it can be # iterated over more than once. diff --git a/tensorflow/contrib/kfac/python/ops/placement.py b/tensorflow/contrib/kfac/python/ops/placement.py index bf12dbaa9adbaa..c4454325aebe13 100644 --- a/tensorflow/contrib/kfac/python/ops/placement.py +++ b/tensorflow/contrib/kfac/python/ops/placement.py @@ -21,8 +21,6 @@ import itertools from tensorflow.python.framework import ops as tf_ops -from tensorflow.python.ops import control_flow_ops -from tensorflow.python.ops import variable_scope def _make_thunk_on_device(func, device): @@ -35,7 +33,7 @@ def thunk(): class RoundRobinPlacementMixin(object): """Implements round robin placement strategy for ops and variables.""" - def __init__(self, cov_devices=None, inv_devices=None, *args, **kwargs): + def __init__(self, cov_devices=None, inv_devices=None, **kwargs): """Initializes the RoundRobinPlacementMixin class. Args: @@ -45,66 +43,15 @@ def __init__(self, cov_devices=None, inv_devices=None, *args, **kwargs): inv_devices: Iterable of device strings (e.g. '/gpu:0'). Inversion computations will be placed on these devices in a round-robin fashion. Can be None, which means that no devices are specified. 
- *args: - **kwargs: + **kwargs: Need something here? """ - super(RoundRobinPlacementMixin, self).__init__(*args, **kwargs) + super(RoundRobinPlacementMixin, self).__init__(**kwargs) self._cov_devices = cov_devices self._inv_devices = inv_devices - def make_ops_and_vars(self, scope=None): - """Make ops and vars with a round-robin device placement strategy. - - For each factor, all of that factor's cov variables and their associated - update ops will be placed on a particular device. A new device is chosen - for each factor by cycling through list of devices in the - `self._cov_devices` attribute. If `self._cov_devices` is `None` then no - explicit device placement occurs. - - An analogous strategy is followed for inverse update ops, with the list of - devices being given by the `self._inv_devices` attribute. - - Inverse variables on the other hand are not placed on any specific device - (they will just use the current the device placement context, whatever - that happens to be). The idea is that the inverse variable belong where - they will be accessed most often, which is the device that actually applies - the preconditioner to the gradient. The user will be responsible for setting - the device context for this. - - Args: - scope: A string or None. If None it will be set to the name of this - estimator (given by the name property). All variables will be created, - and all ops will execute, inside of a variable scope of the given - name. (Default: None) - - Returns: - cov_update_ops: List of ops that compute the cov updates. Corresponds - one-to-one with the list of factors given by the "factors" property. - cov_update_op: cov_update_ops grouped into a single op. - inv_update_ops: List of ops that compute the inv updates. Corresponds - one-to-one with the list of factors given by the "factors" property. - inv_update_op: inv_update_ops grouped into a single op. - cov_update_thunks: Thunks that make the ops in cov_update_ops. - inv_update_thunks: Thunks that make the ops in inv_update_ops. - """ - (cov_update_thunks, - inv_update_thunks) = self.make_vars_and_create_op_thunks(scope=scope) - cov_update_ops = [thunk() for thunk in cov_update_thunks] - inv_update_ops = [thunk() for thunk in inv_update_thunks] - - scope = self.name if scope is None else scope - with variable_scope.variable_scope(scope): - cov_update_op = control_flow_ops.group(cov_update_ops, - name="cov_update_op") - inv_update_op = control_flow_ops.group(inv_update_ops, - name="inv_update_op") - - return (cov_update_ops, cov_update_op, inv_update_ops, inv_update_op, - cov_update_thunks, inv_update_thunks) - def make_vars_and_create_op_thunks(self, scope=None): - """Make vars and create op thunks w/ a round-robin device placement strat. + """Make vars and create op thunks w/ a round-robin device placement start. For each factor, all of that factor's cov variables and their associated update ops will be placed on a particular device. 
A new device is chosen diff --git a/tensorflow/contrib/kfac/python/ops/utils.py b/tensorflow/contrib/kfac/python/ops/utils.py index b6f42815e79fa5..144295f4c7e36f 100644 --- a/tensorflow/contrib/kfac/python/ops/utils.py +++ b/tensorflow/contrib/kfac/python/ops/utils.py @@ -235,6 +235,13 @@ def posdef_eig_self_adjoint(mat): } +def cholesky(tensor, damping): + """Computes the inverse of tensor + damping * identity.""" + identity = linalg_ops.eye(tensor.shape.as_list()[0], dtype=tensor.dtype) + damping = math_ops.cast(damping, dtype=tensor.dtype) + return linalg_ops.cholesky(tensor + damping * identity) + + class SubGraph(object): """Defines a subgraph given by all the dependencies of a given set of outputs. """ @@ -553,13 +560,17 @@ def is_data_format_channel_last(data_format): return data_format.endswith("C") -def matmul_sparse_dense(A, B, name=None): # pylint: disable=invalid-name +def matmul_sparse_dense(A, B, name=None, transpose_a=False, transpose_b=False): # pylint: disable=invalid-name """Computes matmul(A, B) where A is sparse, B is dense. Args: A: tf.IndexedSlices with dense shape [m, n]. B: tf.Tensor with shape [n, k]. name: str. Name of op. + transpose_a: Bool. If true we transpose A before multiplying it by B. + (Default: False) + transpose_b: Bool. If true we transpose B before multiplying it by A. + (Default: False) Returns: tf.IndexedSlices resulting from matmul(A, B). @@ -573,7 +584,8 @@ def matmul_sparse_dense(A, B, name=None): # pylint: disable=invalid-name raise ValueError("A must represent a matrix. Found: %s." % A) if B.shape.ndims != 2: raise ValueError("B must be a matrix.") - new_values = math_ops.matmul(A.values, B) + new_values = math_ops.matmul( + A.values, B, transpose_a=transpose_a, transpose_b=transpose_b) return ops.IndexedSlices( new_values, A.indices, diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD index b527cdad7088a9..7e79b48cdbe14e 100644 --- a/tensorflow/contrib/layers/BUILD +++ b/tensorflow/contrib/layers/BUILD @@ -381,7 +381,7 @@ py_test( py_test( name = "rev_block_lib_test", - size = "small", + size = "medium", srcs = ["python/layers/rev_block_lib_test.py"], srcs_version = "PY2AND3", deps = [ diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops.py b/tensorflow/contrib/layers/python/layers/embedding_ops.py index 49c3faf3b7f5ea..60e1d85ea9c08a 100644 --- a/tensorflow/contrib/layers/python/layers/embedding_ops.py +++ b/tensorflow/contrib/layers/python/layers/embedding_ops.py @@ -458,7 +458,7 @@ def scattered_embedding_lookup_sparse(params, return embeddings -def embedding_lookup_unique(params, ids, name=None): +def embedding_lookup_unique(params, ids, partition_strategy="mod", name=None): """Version of embedding_lookup that avoids duplicate lookups. This can save communication in the case of repeated ids. @@ -470,6 +470,9 @@ def embedding_lookup_unique(params, ids, name=None): `PartitionedVariable`. Shape `[index, d1, d2, ...]`. ids: A one-dimensional `Tensor` with type `int32` or `int64` containing the ids to be looked up in `params`. Shape `[ids1, ids2, ...]`. + partition_strategy: A string specifying the partitioning strategy, relevant + if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default + is `"mod"`. name: A name for this operation (optional). 
Returns: @@ -485,7 +488,8 @@ def embedding_lookup_unique(params, ids, name=None): ids_flat = array_ops.reshape( ids, math_ops.reduce_prod(shape, keepdims=True)) unique_ids, idx = array_ops.unique(ids_flat) - unique_embeddings = embedding_ops.embedding_lookup(params, unique_ids) + unique_embeddings = embedding_ops.embedding_lookup(params, unique_ids, + partition_strategy) embeds_flat = array_ops.gather(unique_embeddings, idx) embed_shape = array_ops.concat( [shape, array_ops.shape(unique_embeddings)[1:]], 0) diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py index bf2514498202e9..dd2395f8c9748d 100644 --- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py +++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py @@ -31,6 +31,7 @@ from tensorflow.python.framework import errors_impl from tensorflow.python.framework import random_seed from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib +from tensorflow.python.framework import test_util from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradient_checker from tensorflow.python.ops import init_ops @@ -691,11 +692,12 @@ def _GroupByBatchEntry(self, vals, vals_per_batch_entry): index += num_val return grouped_vals + @test_util.enable_c_shapes def testEmbeddingLookupSparse(self): vocab_size = 13 batch_size = 10 param_shape = [2, 5] - expected_lookup_result_shape = [None] + param_shape + expected_lookup_result_shape = param_shape sp_ids, sp_weights, ids, weights, vals_per_batch_entry = ( self._RandomIdsAndWeights(batch_size, vocab_size)) @@ -719,7 +721,7 @@ def testEmbeddingLookupSparse(self): None if ignore_weights else sp_weights, combiner=combiner) - self.assertEqual(embedding_sum.get_shape().as_list(), + self.assertEqual(embedding_sum.get_shape().as_list()[1:], expected_lookup_result_shape) tf_embedding_sum = embedding_sum.eval(feed_dict=feed_dict) diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py index 2f3e57653c5d6d..b7194ae3330450 100644 --- a/tensorflow/contrib/layers/python/layers/layers.py +++ b/tensorflow/contrib/layers/python/layers/layers.py @@ -2022,6 +2022,7 @@ def build(self, input_shape): def beta_initializer(shape, dtype=None, partition_info=None): del partition_info # unused + pedestal = array_ops.constant(self._reparam_offset**2, dtype=self.dtype) return math_ops.sqrt(array_ops.ones(shape, dtype=dtype) + pedestal) def gamma_initializer(shape, dtype=None, partition_info=None): @@ -2029,6 +2030,7 @@ def gamma_initializer(shape, dtype=None, partition_info=None): assert len(shape) == 2 assert shape[0] == shape[1] eye = linalg_ops.eye(shape[0], dtype=dtype) + pedestal = array_ops.constant(self._reparam_offset**2, dtype=self.dtype) return math_ops.sqrt(self._gamma_init * eye + pedestal) beta = self.add_variable( diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py index b01fd5d5c95ac1..56e9194cebbe46 100644 --- a/tensorflow/contrib/layers/python/layers/layers_test.py +++ b/tensorflow/contrib/layers/python/layers/layers_test.py @@ -1333,7 +1333,7 @@ def testCreateDropout(self): with self.test_session(): images = np.random.uniform(size=(5, height, width, 3)) output = _layers.dropout(images) - self.assertEqual(output.op.name, 'Dropout/dropout/mul') + self.assertEqual(output.op.name, 'Dropout/dropout_1/mul') 
output.get_shape().assert_is_compatible_with( ops.convert_to_tensor(images).get_shape()) diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib.py b/tensorflow/contrib/layers/python/layers/rev_block_lib.py index 02d294c68f1e10..0e35b1aa8bf682 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib.py @@ -33,23 +33,32 @@ from six.moves import xrange # pylint: disable=redefined-builtin from tensorflow.contrib.framework.python import ops as contrib_framework_ops +from tensorflow.python.eager import backprop from tensorflow.python.framework import dtypes -from tensorflow.python.framework import function from tensorflow.python.framework import ops as framework_ops from tensorflow.python.layers import base from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops from tensorflow.python.ops import control_flow_util +from tensorflow.python.ops import custom_gradient from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import math_ops from tensorflow.python.ops import variable_scope from tensorflow.python.platform import tf_logging as logging from tensorflow.python.util import nest +from tensorflow.python.util import tf_inspect __all__ = ["rev_block", "RevBlock", "recompute_grad"] LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*") _USE_DEFAULT = "__rev_block_lib_default" +_WRONG_VARS_ERR = """\ +The variables used on recompute were different than the variables originally +used. The function wrapped with @recompute_grad likley creates its own variable +scope with a default name and has been called twice in the same enclosing scope. +To fix, ensure each call to the function happens in its own unique variable +scope. +""" def _acc_grads(*lists_of_grads): @@ -146,7 +155,7 @@ def _scope_wrap(fn, scope): @functools.wraps(fn) def wrap(*args, **kwargs): - with variable_scope.variable_scope(scope): + with variable_scope.variable_scope(scope, use_resource=True): return fn(*args, **kwargs) return wrap @@ -221,95 +230,95 @@ def build(self, _): "build.") self.built = True - def _efficient_grad_fn(self, inputs, variables, ys, grad_ys): - """Custom gradient fn for a block of reversible residual layers.""" - # Inputs have passed through an Identity. Recover the original Tensors to - # be able to match up side inputs. 
- assert [u"Identity"] == list(set([x.op.type for x in inputs])) - inputs = [x.op.inputs[0] for x in inputs] - side_inputs = inputs[2:] - del inputs - - f_side_idxs = [None] * len(self.f_side_input) - g_side_idxs = [None] * len(self.g_side_input) - assert len(side_inputs) == len(self.f_side_input) + len(self.g_side_input) - - for i, t in enumerate(side_inputs): - if t in self.f_side_input: - f_side_idxs[self.f_side_input.index(t)] = i - elif t in self.g_side_input: - g_side_idxs[self.g_side_input.index(t)] = i - else: - assert False - - f_vars = [[] for _ in range(self.num_layers)] - g_vars = [[] for _ in range(self.num_layers)] - f_vars_idxs = [[] for _ in range(self.num_layers)] - g_vars_idxs = [[] for _ in range(self.num_layers)] - - for i, ref in enumerate(variables): - # Use the name to identify the layer number and function (f or g) - regex = LAYER_RE.match(ref.name) - layer_no = int(regex.group(1)) - fn_name = regex.group(2) - if fn_name == "f": - f_vars[layer_no].append(ref) - f_vars_idxs[layer_no].append(i) - else: - assert fn_name == "g" - g_vars[layer_no].append(ref) - g_vars_idxs[layer_no].append(i) - - f_var_grads = [] - g_var_grads = [] - f_side_grads = [] - g_side_grads = [] - - # Reverse variable containers to go backward - f_vars.reverse() - g_vars.reverse() - f = list(self.f) - g = list(self.g) - f.reverse() - g.reverse() - - with variable_scope.variable_scope(self.scope_name, reuse=True): - for i in xrange(self.num_layers): - ys, grad_ys, f_ret, g_ret = _rev_layer_backward( - ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i], - self.g_side_input) - - grad_f_vars, grad_f_side = f_ret - grad_g_vars, grad_g_side = g_ret - f_var_grads.append(grad_f_vars) - g_var_grads.append(grad_g_vars) - f_side_grads.append(grad_f_side) - g_side_grads.append(grad_g_side) - - # Accumulate layer gradients for f_side_input and g_side_input - acc_f_side_grads = _acc_grads(*f_side_grads) - acc_g_side_grads = _acc_grads(*g_side_grads) - - # Use the stored idxs to put gradients in the passed-in order. - side_input_grads = [None] * len(side_inputs) - variable_grads = [None] * len(variables) - - # Variable gradients were collected in reverse layer order. Reverse to match - # idxs. 
- f_var_grads.reverse() - g_var_grads.reverse() - for idxs, grads in list(zip(f_vars_idxs, f_var_grads)) + list( - zip(g_vars_idxs, g_var_grads)): - for i, grad in zip(idxs, grads): - variable_grads[i] = grad - - for i, grad in zip(f_side_idxs, acc_f_side_grads): - side_input_grads[i] = grad - for i, grad in zip(g_side_idxs, acc_g_side_grads): - side_input_grads[i] = grad - - grad_x1, grad_x2 = grad_ys - return [grad_x1, grad_x2] + side_input_grads, variable_grads + def _make_efficient_grad_fn(self, inputs_, ys_): + def _efficient_grad_fn(*grad_ys, **kwargs): + """Custom gradient fn for a block of reversible residual layers.""" + inputs = inputs_ + ys = ys_ + variables = kwargs["variables"] + side_inputs = inputs[2:] + + f_side_idxs = [None] * len(self.f_side_input) + g_side_idxs = [None] * len(self.g_side_input) + assert len(side_inputs) == len(self.f_side_input) + len(self.g_side_input) + + for i, t in enumerate(side_inputs): + if t in self.f_side_input: + f_side_idxs[self.f_side_input.index(t)] = i + elif t in self.g_side_input: + g_side_idxs[self.g_side_input.index(t)] = i + else: + assert False + + f_vars = [[] for _ in range(self.num_layers)] + g_vars = [[] for _ in range(self.num_layers)] + f_vars_idxs = [[] for _ in range(self.num_layers)] + g_vars_idxs = [[] for _ in range(self.num_layers)] + + for i, ref in enumerate(variables): + # Use the name to identify the layer number and function (f or g) + regex = LAYER_RE.match(ref.name) + layer_no = int(regex.group(1)) + fn_name = regex.group(2) + if fn_name == "f": + f_vars[layer_no].append(ref) + f_vars_idxs[layer_no].append(i) + else: + assert fn_name == "g" + g_vars[layer_no].append(ref) + g_vars_idxs[layer_no].append(i) + + f_var_grads = [] + g_var_grads = [] + f_side_grads = [] + g_side_grads = [] + + # Reverse variable containers to go backward + f_vars.reverse() + g_vars.reverse() + f = list(self.f) + g = list(self.g) + f.reverse() + g.reverse() + + with variable_scope.variable_scope(self.scope_name, reuse=True): + for i in xrange(self.num_layers): + ys, grad_ys, f_ret, g_ret = _rev_layer_backward( + ys, grad_ys, f[i], g[i], f_vars[i], self.f_side_input, g_vars[i], + self.g_side_input) + + grad_f_vars, grad_f_side = f_ret + grad_g_vars, grad_g_side = g_ret + f_var_grads.append(grad_f_vars) + g_var_grads.append(grad_g_vars) + f_side_grads.append(grad_f_side) + g_side_grads.append(grad_g_side) + + # Accumulate layer gradients for f_side_input and g_side_input + acc_f_side_grads = _acc_grads(*f_side_grads) + acc_g_side_grads = _acc_grads(*g_side_grads) + + # Use the stored idxs to put gradients in the passed-in order. + side_input_grads = [None] * len(side_inputs) + variable_grads = [None] * len(variables) + + # Variable gradients were collected in reverse layer order. Reverse to + # match idxs. 
+ f_var_grads.reverse() + g_var_grads.reverse() + for idxs, grads in list(zip(f_vars_idxs, f_var_grads)) + list( + zip(g_vars_idxs, g_var_grads)): + for i, grad in zip(idxs, grads): + variable_grads[i] = grad + + for i, grad in zip(f_side_idxs, acc_f_side_grads): + side_input_grads[i] = grad + for i, grad in zip(g_side_idxs, acc_g_side_grads): + side_input_grads[i] = grad + + grad_x1, grad_x2 = grad_ys + return [grad_x1, grad_x2] + side_input_grads, variable_grads + return _efficient_grad_fn def _forward(self, x1, x2): """Run forward through the reversible layers.""" @@ -317,10 +326,6 @@ def _forward(self, x1, x2): side_inputs = [self.f_side_input, self.g_side_input] flat_side_inputs = nest.flatten(side_inputs) - custom_grad_fn = ( - self._efficient_grad_fn if self._use_efficient_backprop else None) - - @_fn_with_custom_grad(custom_grad_fn) def _forward_wrap(x1_, x2_, *flat_side_inputs): f_side, g_side = nest.pack_sequence_as(side_inputs, flat_side_inputs) return _rev_block_forward( @@ -333,7 +338,16 @@ def _forward_wrap(x1_, x2_, *flat_side_inputs): g_side_input=g_side, gate_outputs=self._use_efficient_backprop) - return _forward_wrap(x1, x2, *flat_side_inputs) + @custom_gradient.custom_gradient + def _forward_with_custom_grad(*args): + out = _forward_wrap(*args) # pylint: disable=no-value-for-parameter + grad_fn = self._make_efficient_grad_fn(args, out) + return out, grad_fn + + if self._use_efficient_backprop: + return _forward_with_custom_grad(x1, x2, *flat_side_inputs) + else: + return _forward_wrap(x1, x2, *flat_side_inputs) def _backward(self, y1, y2): """Run backward through the reversible layers.""" @@ -432,6 +446,19 @@ def new_dec(*args, **kwargs): def recompute_grad(fn, use_data_dep=_USE_DEFAULT, tupleize_grads=False): """Decorator that recomputes the function on the backwards pass. + To use this function, you must use `ResourceVariable`s (i.e. + `variable_scope(name, use_resource=True), which are the default in Eager mode + and when running on TPU. + + Warning: Because the function will be called again on the backwards pass, the + user should be careful to not use ops in their function that mutate state or + have randomness (for example, batch normalization or dropout). If the function + does have such operations, it is recommended that the function take the + `is_recomputing` keyword argument which will be `False` on the forward pass + and `True` on the backwards pass so that it can disable state changes when + `is_recomputing=True` (for example, not updating the moving averages in batch + normalization). + Args: fn: a function that takes Tensors (all as positional arguments) and returns a tuple of Tensors. 
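
The docstring added above asks callers to use resource variables and, for
functions with state-changing ops, to accept an `is_recomputing` keyword. A
minimal sketch of such a layer, closely following the `testWithIsRecomputeKwarg`
test added later in this patch (layer sizes and scope names are illustrative):

```python
import tensorflow as tf
from tensorflow.contrib.layers.python.layers import rev_block_lib

@rev_block_lib.recompute_grad
def layer_with_recompute(inputs, is_recomputing=False):
  out = tf.layers.dense(inputs, 2)
  out = tf.layers.batch_normalization(out, training=True)
  if is_recomputing:
    # The recomputation pass re-creates the moving-average update ops; drop
    # the duplicates so each statistic is only updated once per step.
    update_ops = tf.get_collection_ref(tf.GraphKeys.UPDATE_OPS)
    update_ops.pop()
    update_ops.pop()
  return out

x = tf.ones((2, 4), dtype=tf.float32)
with tf.variable_scope("layer1", use_resource=True):
  y = layer_with_recompute(x)
loss = tf.reduce_sum(y)
grads = tf.gradients(loss, [x] + tf.trainable_variables())
```
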
@@ -465,6 +492,7 @@ def _is_on_tpu(): def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False): """See recompute_grad.""" + has_is_recompute_kwarg = "is_recomputing" in tf_inspect.getargspec(fn).args for arg in args: if not isinstance(arg, framework_ops.Tensor): raise ValueError("All inputs to function must be Tensors") @@ -472,44 +500,61 @@ def _recompute_grad(fn, args, use_data_dep=_USE_DEFAULT, tupleize_grads=False): if use_data_dep_ == _USE_DEFAULT: use_data_dep_ = _is_on_tpu() - cached_vs = [] - cached_arg_scope = [] - - def grad_fn(inputs, variables, outputs, output_grads): - """Recompute outputs for gradient computation.""" - del outputs - # Recompute outputs - with framework_ops.control_dependencies(output_grads): - if use_data_dep_: - inputs = _force_data_dependency(output_grads, inputs) - with contrib_framework_ops.arg_scope(cached_arg_scope[0]): - with variable_scope.variable_scope(cached_vs[0], reuse=True): - outputs = fn(*inputs) - - if not (isinstance(outputs, list) or isinstance(outputs, tuple)): - outputs = [outputs] - outputs = list(outputs) - grads = gradients_impl.gradients(outputs, inputs + variables, output_grads) - - if tupleize_grads: - if use_data_dep_: - grads = _tuple_with_data_dep(grads) - else: - grads = control_flow_ops.tuple(grads) - - grad_inputs = grads[:len(inputs)] - grad_vars = grads[len(inputs):] - return grad_inputs, grad_vars - - @_fn_with_custom_grad(grad_fn) + @custom_gradient.custom_gradient def fn_with_recompute(*args): - cached_vs.append(variable_scope.get_variable_scope()) - # TODO(rsepassi): Rm conditional in TF 1.4 - if hasattr(contrib_framework_ops, "current_arg_scope"): - cached_arg_scope.append(contrib_framework_ops.current_arg_scope()) - else: - cached_arg_scope.append({}) - return fn(*args) + """Wrapper for fn.""" + # Forward pass + vs = variable_scope.get_variable_scope() + arg_scope = contrib_framework_ops.current_arg_scope() + with backprop.GradientTape() as tape: + fn_kwargs = {} + if has_is_recompute_kwarg: + fn_kwargs["is_recomputing"] = False + outputs = fn(*args, **fn_kwargs) + original_vars = set(tape.watched_variables()) + + # Backward pass + def grad_fn(*output_grads, **kwargs): + """Recompute outputs for gradient computation.""" + variables = [] + if original_vars: + variables = kwargs["variables"] + if set(variables) != original_vars: + raise ValueError(_WRONG_VARS_ERR) + del kwargs + inputs = list(args) + # Recompute outputs + with framework_ops.control_dependencies(output_grads): + if use_data_dep_: + inputs = _force_data_dependency(output_grads, inputs) + with contrib_framework_ops.arg_scope(arg_scope): + with variable_scope.variable_scope(vs, reuse=True): + with backprop.GradientTape() as tape: + fn_kwargs = {} + if has_is_recompute_kwarg: + fn_kwargs["is_recomputing"] = True + outputs = fn(*inputs, **fn_kwargs) + recompute_vars = set(tape.watched_variables()) + if original_vars != recompute_vars: + raise ValueError(_WRONG_VARS_ERR) + + if not (isinstance(outputs, list) or isinstance(outputs, tuple)): + outputs = [outputs] + outputs = list(outputs) + grads = gradients_impl.gradients(outputs, inputs + variables, + output_grads) + + if tupleize_grads: + if use_data_dep_: + grads = _tuple_with_data_dep(grads) + else: + grads = control_flow_ops.tuple(grads) + + grad_inputs = grads[:len(inputs)] + grad_vars = grads[len(inputs):] + return grad_inputs, grad_vars + + return outputs, grad_fn return fn_with_recompute(*args) @@ -536,107 +581,6 @@ def _underlying_variable_ref(t): return None -def 
_fn_with_custom_grad(grad_fn, use_global_vars=False): - """Decorator to create a subgraph with a custom gradient function. - - The subgraph created by the decorated function is NOT put in a Defun and so - does not suffer from the limitations of the Defun (all subgraph ops on the - same device, no summaries). - - Args: - grad_fn: function with signature - (inputs, variables, outputs, output_grads) -> (grad_inputs, grad_vars), - all of which are lists of Tensors. - use_global_vars: if True, variables will be the global variables created. - If False, will be the trainable variables. - - Returns: - Decorator for function such that the gradient is defined by grad_fn. - """ - - def dec(fn): - - @functools.wraps(fn) - def wrapped(*args): - return _fn_with_custom_grad_internal( - fn, args, grad_fn, use_global_vars=use_global_vars) - - return wrapped - - return dec - - -def _fn_with_custom_grad_internal(fn, inputs, grad_fn, use_global_vars=False): - """Create a subgraph with a custom gradient. - - Args: - fn: function that takes inputs as arguments and produces 1 or more Tensors. - inputs: list, will be passed as fn(*inputs). - grad_fn: function with signature - (inputs, vars, outputs, output_grads) -> (grad_inputs, grad_vars), - all of which are lists of Tensors. - use_global_vars: if True, variables will be the global variables created. - If False, will be the trainable variables. - - Returns: - fn(*inputs) - """ - vs = variable_scope.get_variable_scope() - get_vars_fn = ( - vs.global_variables if use_global_vars else vs.trainable_variables) - len_before_vars = len(get_vars_fn()) - inputs = [array_ops.identity(x) for x in inputs] - outputs = fn(*inputs) - train_vars = get_vars_fn()[len_before_vars:] - - if grad_fn is None: - return outputs - - if not (isinstance(outputs, tuple) or isinstance(outputs, list)): - outputs = [outputs] - outputs = list(outputs) - - defun_inputs = [inputs, train_vars, outputs] - - def custom_grad_fn(op, *dys): - """Custom grad fn applying grad_fn for identity Defun.""" - fn_inputs, fn_vars, fn_outputs = nest.pack_sequence_as( - defun_inputs, list(op.inputs)) - fn_vars = [_underlying_variable_ref(v) for v in fn_vars] - dys = list(dys) - assert len(fn_outputs) == len(outputs) - assert len(fn_outputs) == len(dys) - - grad_inputs, grad_vars = grad_fn(fn_inputs, fn_vars, fn_outputs, dys) - grad_outputs = [None] * len(fn_outputs) - return tuple(grad_inputs + grad_vars + grad_outputs) - - # The Defun takes as input the original inputs, the trainable variables - # created in fn, and the outputs. In the forward it passes through the - # outputs. In the backwards, it produces gradients for the original inputs - # and the trainable variables. - in_types = [t.dtype for t in inputs] - out_types = [t.dtype for t in outputs] - var_types = [t.dtype for t in train_vars] - - # Get a unique name for the Defun - with framework_ops.name_scope("identity_custom_grad") as ns: - defun_name = ns - - @function.Defun( - *(in_types + var_types + out_types), - func_name=defun_name, - python_grad_func=custom_grad_fn, - shape_func=lambda _: [t.get_shape() for t in outputs]) - def identity(*args): - _, _, outs = nest.pack_sequence_as(defun_inputs, args) - return tuple([array_ops.identity(t) for t in outs]) - - flat_inputs = nest.flatten(defun_inputs) - id_out = identity(*flat_inputs) - return id_out - - def _force_data_dependency(first_compute, then_compute): """Force all of `then_compute` to depend on all of `first_compute`. 
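
The Defun-based `_fn_with_custom_grad` helper deleted above is superseded by
`tf.custom_gradient`, whose gradient function receives any variables created in
the forward pass through a `variables` keyword argument and returns input and
variable gradients separately. A small sketch of that pattern, illustrative
only and not taken from this patch:

```python
import tensorflow as tf

@tf.custom_gradient
def scale_by_var(x):
  # Resource variable created inside the decorated function so that
  # custom_gradient can track it.
  v = tf.get_variable("scale", shape=(), use_resource=True,
                      initializer=tf.ones_initializer())
  out = v * x

  def grad_fn(dy, variables=None):
    # Return two structures: gradients for the inputs and gradients for the
    # variables used by the forward pass.
    return [dy * v], [tf.reduce_sum(dy * x)]

  return out, grad_fn

x = tf.constant([1.0, 2.0, 3.0])
y = scale_by_var(x)
grads = tf.gradients(tf.reduce_sum(y), [x])
```
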
diff --git a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py index 8c118402a4c85d..bc09ba8d439808 100644 --- a/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py +++ b/tensorflow/contrib/layers/python/layers/rev_block_lib_test.py @@ -21,9 +21,11 @@ from tensorflow.contrib.layers.python.layers import layers from tensorflow.contrib.layers.python.layers import rev_block_lib from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops from tensorflow.python.framework import random_seed from tensorflow.python.layers import convolutional from tensorflow.python.layers import core as core_layers +from tensorflow.python.layers import normalization as normalization_layers from tensorflow.python.ops import array_ops from tensorflow.python.ops import gradients_impl from tensorflow.python.ops import init_ops @@ -83,8 +85,8 @@ def g(x): sess.run(variables.global_variables_initializer()) y1, y2, y1_inv, y2_inv = sess.run([y1, y2, y1_inv, y2_inv]) - self.assertAllClose(y1, y1_inv) - self.assertAllClose(y2, y2_inv) + self.assertAllClose(y1, y1_inv, rtol=1e-5) + self.assertAllClose(y2, y2_inv, rtol=1e-5) def _testRevBlock(self, x=None, @@ -179,18 +181,16 @@ def f2(x): self._testRevBlock(f=[f1, f2, f1, f2]) - # TODO(rsepassi): Recent change to conv seems to have broken this test. Find - # out why. - def _testConvAndBatchNorm(self): + def testConvAndBatchNorm(self): x = random_ops.random_uniform( [self.BATCH_SIZE, 10, self.CHANNELS], dtype=dtypes.float32) def f(x): x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same") - x = layers.batch_norm(x, is_training=True) + x = layers.batch_norm(x, is_training=False) x = convolutional.conv1d(x, self.CHANNELS // 2, 3, padding="same") - x = layers.batch_norm(x, is_training=True) + x = layers.batch_norm(x, is_training=False) return x self._testRevBlock(x=x, f=f) @@ -278,7 +278,7 @@ def fn_both(x): ] outputs_and_vars = [] for name, wrapped_fn in names_and_fns: - with variable_scope.variable_scope(name) as vs: + with variable_scope.variable_scope(name, use_resource=True) as vs: out = math_ops.reduce_sum(wrapped_fn(x)) outputs_and_vars.append((out, vs.trainable_variables())) @@ -304,103 +304,73 @@ def fn_both(x): self.assertAllClose(current, g) current = g - def testResourceVariable(self): - @rev_block_lib.recompute_grad(tupleize_grads=True) + def testDoubleCallInSameScopeFails(self): + + @rev_block_lib.recompute_grad def layer_with_recompute(inputs): - var = variable_scope.get_variable("var", ()) - return var * inputs + return core_layers.dense(inputs, 2) - inputs = array_ops.ones((), dtypes.float32) with variable_scope.variable_scope("layer", use_resource=True): - outputs = layer_with_recompute(inputs) - loss = math_ops.square(outputs) - grads = gradients_impl.gradients(loss, variables.trainable_variables()) - self.assertEqual(1, len(grads)) - self.assertTrue(grads[0] is not None) + inputs = array_ops.ones((2, 4), dtypes.float32) + out1 = layer_with_recompute(inputs) + out2 = layer_with_recompute(inputs) + out1 + out = math_ops.reduce_sum(out2) + tvars = variables.trainable_variables() + assert len(tvars) == 4 + with self.assertRaisesWithPredicateMatch( + ValueError, "called twice in the same enclosing scope"): + gradients_impl.gradients(out, [inputs] + tvars) -class FnWithCustomGradTest(test.TestCase): + def testDoubleCallInUniqueScope(self): - def testCorrectness(self): + @rev_block_lib.recompute_grad + def layer_with_recompute(inputs): + 
with variable_scope.variable_scope("inner", use_resource=True): + return core_layers.dense(inputs, 2) - w = random_ops.random_uniform([6, 10]) + with variable_scope.variable_scope("layer", use_resource=True): + inputs = array_ops.ones((2, 4), dtypes.float32) - def fn(a, b, c): - return core_layers.dense( - a, - 10, - use_bias=False, - kernel_initializer=lambda shape, dtype, partition_info: w - ) + math_ops.matmul(b, c) - - def grad_fn(inputs, trainable_variables, outputs, grad_outputs): - outputs = outputs[0] - grad_outputs = grad_outputs[0] - grad_inputs = gradients_impl.gradients( - outputs, inputs, grad_ys=grad_outputs) - grad_vars = gradients_impl.gradients( - outputs, trainable_variables, grad_ys=grad_outputs) - return grad_inputs, grad_vars - - custom_fn = rev_block_lib._fn_with_custom_grad(grad_fn)(fn) - - a = random_ops.random_uniform([11, 6]) - b = random_ops.random_uniform([11, 7]) - c = random_ops.random_uniform([7, 10]) - - out = fn(a, b, c) - custom_out = custom_fn(a, b, c) - self.assertEqual(out.get_shape().as_list(), - custom_out.get_shape().as_list()) - - loss = math_ops.reduce_mean(out) - custom_loss = math_ops.reduce_mean(custom_out) - - grads = gradients_impl.gradients( - loss, [a, b, c] + [variables.trainable_variables()[0]]) - custom_grads = gradients_impl.gradients( - custom_loss, [a, b, c] + [variables.trainable_variables()[1]]) + with variable_scope.variable_scope("layer1", use_resource=True): + out1 = layer_with_recompute(inputs) + with variable_scope.variable_scope("layer2", use_resource=True): + out2 = layer_with_recompute(inputs) + out1 + out = math_ops.reduce_sum(out2) - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - out_val, custom_out_val, grads_val, custom_grads_val = sess.run( - [out, custom_out, grads, custom_grads]) - self.assertAllClose(out_val, custom_out_val) - for g1, g2 in zip(grads_val, custom_grads_val): - self.assertAllClose(g1, g2) - - def testCustomGrad(self): - - def fn(a, b, c): - return core_layers.dense(a, 10, use_bias=False) + math_ops.matmul(b, c) - - def grad_fn(inputs, trainable_variables, unused_outputs, - unused_grad_outputs): - grad_inputs = [ - array_ops.ones_like(t) * (i + 1.) for i, t in enumerate(inputs) - ] - grad_vars = [ - array_ops.ones_like(t) * (i + len(inputs) + 1.) - for i, t in enumerate(trainable_variables) - ] - return grad_inputs, grad_vars - - a = random_ops.random_uniform([11, 6]) - b = random_ops.random_uniform([11, 7]) - c = random_ops.random_uniform([7, 10]) - w = random_ops.random_uniform([6, 10]) - out = rev_block_lib._fn_with_custom_grad(grad_fn)(fn)(a, b, c) - loss = math_ops.reduce_mean(out) - grads = gradients_impl.gradients( - loss, [a, b, c, variables.trainable_variables()[0]]) - expected_grads = [ - array_ops.ones_like(t) * (i + 1.) 
for i, t in enumerate([a, b, c, w]) - ] - with self.test_session() as sess: - sess.run(variables.global_variables_initializer()) - g_val, eg_val = sess.run([grads, expected_grads]) - for g1, g2 in zip(g_val, eg_val): - self.assertAllClose(g1, g2) + tvars = variables.trainable_variables() + assert len(tvars) == 4 + grads = gradients_impl.gradients(out, [inputs] + tvars) + for grad in grads: + self.assertTrue(grad is not None) + + def testWithIsRecomputeKwarg(self): + + kwarg_values = [] + + @rev_block_lib.recompute_grad + def layer_with_recompute(inputs, is_recomputing=False): + kwarg_values.append(is_recomputing) + out = core_layers.dense(inputs, 2) + out = normalization_layers.batch_normalization(out, training=True) + if is_recomputing: + # Ensure that the updates are not duplicated by popping off the latest + # 2 additions. + update_ops = ops.get_collection_ref(ops.GraphKeys.UPDATE_OPS) + update_ops.pop() + update_ops.pop() + return out + + x = array_ops.ones((2, 4), dtypes.float32) + with variable_scope.variable_scope("layer1", use_resource=True): + y = layer_with_recompute(x) + loss = math_ops.reduce_sum(y) + tvars = variables.trainable_variables() + gradients_impl.gradients(loss, [x] + tvars) + + update_ops = ops.get_collection(ops.GraphKeys.UPDATE_OPS) + self.assertEqual(2, len(update_ops)) + self.assertEqual([False, True], kwarg_values) if __name__ == "__main__": diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD index 3b053cd4c66952..b56a88659bbd44 100644 --- a/tensorflow/contrib/learn/BUILD +++ b/tensorflow/contrib/learn/BUILD @@ -284,6 +284,7 @@ py_test( tags = [ "manual", "noasan", # times out + "optonly", # test is flaky without optimization. ], deps = [ ":learn", @@ -434,6 +435,7 @@ py_test( name = "kmeans_test", size = "medium", srcs = ["python/learn/estimators/kmeans_test.py"], + shard_count = 4, srcs_version = "PY2AND3", tags = [ "noasan", # b/73741358 @@ -485,6 +487,7 @@ py_test( name = "state_saving_rnn_estimator_test", size = "medium", srcs = ["python/learn/estimators/state_saving_rnn_estimator_test.py"], + shard_count = 4, srcs_version = "PY2AND3", tags = ["noasan"], deps = [ @@ -744,7 +747,7 @@ py_test( tf_py_test( name = "graph_io_test", - size = "small", + size = "medium", srcs = ["python/learn/learn_io/graph_io_test.py"], additional_deps = [ ":learn", diff --git a/tensorflow/contrib/learn/python/learn/estimators/head.py b/tensorflow/contrib/learn/python/learn/estimators/head.py index e28e6854a5097d..339c4e0e360ed9 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/head.py +++ b/tensorflow/contrib/learn/python/learn/estimators/head.py @@ -1862,12 +1862,12 @@ def _get_arguments(func): if hasattr(func, "__code__"): # Regular function. return tf_inspect.getargspec(func) - elif hasattr(func, "__call__"): - # Callable object. - return _get_arguments(func.__call__) elif hasattr(func, "func"): # Partial function. return _get_arguments(func.func) + elif hasattr(func, "__call__"): + # Callable object. 
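+    # Checked after the partial-function case above: functools.partial objects
+    # are also callable, so they must be routed to the `func` branch first.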
+ return _get_arguments(func.__call__) def _verify_loss_fn_args(loss_fn): diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py index 70b70af98c51dc..e100bc7a1e7be4 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py @@ -31,7 +31,6 @@ from tensorflow.contrib import layers from tensorflow.contrib.framework import deprecated from tensorflow.contrib.framework import deprecated_arg_values -from tensorflow.python.training import training_util from tensorflow.contrib.layers.python.layers import feature_column from tensorflow.contrib.learn.python.learn.estimators import estimator from tensorflow.contrib.learn.python.learn.estimators import head as head_lib @@ -51,6 +50,7 @@ from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import session_run_hook from tensorflow.python.training import training as train +from tensorflow.python.training import training_util # The default learning rate of 0.2 is a historical artifact of the initial @@ -244,7 +244,9 @@ def sdca_model_fn(features, labels, mode, params): parent_scope = "linear" with variable_scope.variable_scope( - values=features.values(), name_or_scope=parent_scope) as scope: + values=features.values(), + name_or_scope=parent_scope, + partitioner=optimizer.partitioner) as scope: features = features.copy() features.update(layers.transform_features(features, feature_columns)) logits, columns_to_variables, bias = ( diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py index d3bb0fda5765d8..597ca4e86dbf66 100644 --- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py +++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py @@ -43,6 +43,7 @@ from tensorflow.python.framework import sparse_tensor from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops +from tensorflow.python.ops import partitioned_variables from tensorflow.python.platform import test from tensorflow.python.training import ftrl from tensorflow.python.training import input as input_lib @@ -863,6 +864,38 @@ def input_fn(): scores = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(scores['accuracy'], 0.9) + def testSdcaOptimizerWeightedSparseFeaturesOOVWithNoOOVBuckets(self): + """LinearClassifier with SDCAOptimizer with OOV features (-1 IDs).""" + + def input_fn(): + return { + 'example_id': + constant_op.constant(['1', '2', '3']), + 'price': + sparse_tensor.SparseTensor( + values=[2., 3., 1.], + indices=[[0, 0], [1, 0], [2, 0]], + dense_shape=[3, 5]), + 'country': + sparse_tensor.SparseTensor( + # 'GB' is out of the vocabulary. 
+ values=['IT', 'US', 'GB'], + indices=[[0, 0], [1, 0], [2, 0]], + dense_shape=[3, 5]) + }, constant_op.constant([[1], [0], [1]]) + + country = feature_column_lib.sparse_column_with_keys( + 'country', keys=['US', 'CA', 'MK', 'IT', 'CN']) + country_weighted_by_price = feature_column_lib.weighted_sparse_column( + country, 'price') + sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( + example_id_column='example_id') + classifier = linear.LinearClassifier( + feature_columns=[country_weighted_by_price], optimizer=sdca_optimizer) + classifier.fit(input_fn=input_fn, steps=50) + scores = classifier.evaluate(input_fn=input_fn, steps=1) + self.assertGreater(scores['accuracy'], 0.9) + def testSdcaOptimizerCrossedFeatures(self): """Tests LinearClassifier with SDCAOptimizer and crossed features.""" @@ -934,6 +967,63 @@ def input_fn(): scores = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(scores['accuracy'], 0.9) + def testSdcaOptimizerPartitionedVariables(self): + """Tests LinearClassifier with SDCAOptimizer with partitioned variables.""" + + def input_fn(): + return { + 'example_id': + constant_op.constant(['1', '2', '3']), + 'price': + constant_op.constant([[0.6], [0.8], [0.3]]), + 'sq_footage': + constant_op.constant([[900.0], [700.0], [600.0]]), + 'country': + sparse_tensor.SparseTensor( + values=['IT', 'US', 'GB'], + indices=[[0, 0], [1, 3], [2, 1]], + dense_shape=[3, 5]), + 'weights': + constant_op.constant([[3.0], [1.0], [1.0]]) + }, constant_op.constant([[1], [0], [1]]) + + price = feature_column_lib.real_valued_column('price') + sq_footage_bucket = feature_column_lib.bucketized_column( + feature_column_lib.real_valued_column('sq_footage'), + boundaries=[650.0, 800.0]) + country = feature_column_lib.sparse_column_with_hash_bucket( + 'country', hash_bucket_size=5) + sq_footage_country = feature_column_lib.crossed_column( + [sq_footage_bucket, country], hash_bucket_size=10) + + sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( + example_id_column='example_id', + partitioner=partitioned_variables.fixed_size_partitioner( + num_shards=2, axis=0)) + + tf_config = { + 'cluster': { + run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] + } + } + with test.mock.patch.dict('os.environ', + {'TF_CONFIG': json.dumps(tf_config)}): + config = run_config.RunConfig() + # Because we did not start a distributed cluster, we need to pass an + # empty ClusterSpec, otherwise the device_setter will look for + # distributed jobs, such as "/job:ps" which are not present. + config._cluster_spec = server_lib.ClusterSpec({}) + + classifier = linear.LinearClassifier( + feature_columns=[price, sq_footage_bucket, country, sq_footage_country], + weight_column_name='weights', + optimizer=sdca_optimizer, + config=config) + classifier.fit(input_fn=input_fn, steps=50) + scores = classifier.evaluate(input_fn=input_fn, steps=1) + print('all scores = {}'.format(scores)) + self.assertGreater(scores['accuracy'], 0.9) + def testEval(self): """Tests that eval produces correct metrics. 
""" @@ -1508,6 +1598,60 @@ def input_fn(): loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] self.assertLess(loss, 0.05) + def testSdcaOptimizerPartitionedVariables(self): + """Tests LinearRegressor with SDCAOptimizer with partitioned variables.""" + + def input_fn(): + return { + 'example_id': + constant_op.constant(['1', '2', '3']), + 'price': + constant_op.constant([0.6, 0.8, 0.3]), + 'sq_footage': + constant_op.constant([[900.0], [700.0], [600.0]]), + 'country': + sparse_tensor.SparseTensor( + values=['IT', 'US', 'GB'], + indices=[[0, 0], [1, 3], [2, 1]], + dense_shape=[3, 5]), + 'weights': + constant_op.constant([[3.0], [5.0], [7.0]]) + }, constant_op.constant([[1.55], [-1.25], [-3.0]]) + + price = feature_column_lib.real_valued_column('price') + sq_footage_bucket = feature_column_lib.bucketized_column( + feature_column_lib.real_valued_column('sq_footage'), + boundaries=[650.0, 800.0]) + country = feature_column_lib.sparse_column_with_hash_bucket( + 'country', hash_bucket_size=5) + sq_footage_country = feature_column_lib.crossed_column( + [sq_footage_bucket, country], hash_bucket_size=10) + sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer( + example_id_column='example_id', symmetric_l2_regularization=1.0, + partitioner=partitioned_variables.fixed_size_partitioner( + num_shards=2, axis=0)) + tf_config = { + 'cluster': { + run_config.TaskType.PS: ['fake_ps_0', 'fake_ps_1'] + } + } + with test.mock.patch.dict('os.environ', + {'TF_CONFIG': json.dumps(tf_config)}): + config = run_config.RunConfig() + # Because we did not start a distributed cluster, we need to pass an + # empty ClusterSpec, otherwise the device_setter will look for + # distributed jobs, such as "/job:ps" which are not present. + config._cluster_spec = server_lib.ClusterSpec({}) + + regressor = linear.LinearRegressor( + feature_columns=[price, sq_footage_bucket, country, sq_footage_country], + weight_column_name='weights', + optimizer=sdca_optimizer, + config=config) + regressor.fit(input_fn=input_fn, steps=20) + loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] + self.assertLess(loss, 0.05) + def testSdcaOptimizerSparseFeaturesWithL1Reg(self): """Tests LinearClassifier with SDCAOptimizer and sparse features.""" diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py index 3744abd860e7f4..541da9061732ad 100644 --- a/tensorflow/contrib/learn/python/learn/experiment.py +++ b/tensorflow/contrib/learn/python/learn/experiment.py @@ -38,19 +38,19 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config from tensorflow.contrib.tpu.python.tpu import tpu_estimator from tensorflow.python.estimator import estimator as core_estimator -from tensorflow.python.estimator import util as estimator_util from tensorflow.python.framework import ops from tensorflow.python.platform import tf_logging as logging from tensorflow.python.training import basic_session_run_hooks from tensorflow.python.training import saver from tensorflow.python.training import server_lib from tensorflow.python.util import compat +from tensorflow.python.util import function_utils __all__ = ["Experiment"] def _get_standardized_predicate_fn(predicate_fn): - pred_fn_args = estimator_util.fn_args(predicate_fn) + pred_fn_args = function_utils.fn_args(predicate_fn) if "checkpoint_path" not in pred_fn_args: # pylint: disable=unused-argument def _pred_fn_wrapper(eval_results, checkpoint_path): @@ -468,10 +468,15 @@ def _continuous_eval(self, on which that evaluation was 
based. At the beginning of evaluation, the passed `eval_results` will be None so it's expected that the predicate function handles that gracefully. - When `predicate_fn` is not specified, continuous eval will run in an - infinite loop (if `train_steps` is None). or exit once global step - reaches `train_steps`. - + Continuous eval behavior under different conditions: + * When `predicate_fn` is specified: + + if `train_steps` is None, run until `predicate_fn` returns False. + + if `train_steps` is specified, run until either global step + reaches `train_steps` or `predicate_fn` returns False. + * When `predicate_fn` is not specified: + + if `train_steps` is None, run in an infinite loop. + + if `train_steps` is specified, run until global step reaches + `train_steps`. export: Whether to export from this step. Default is 'True'. Raises: diff --git a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py index c7cdb4131215c3..f8106d1e4a7e79 100644 --- a/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py +++ b/tensorflow/contrib/learn/python/learn/utils/saved_model_export_utils.py @@ -343,7 +343,8 @@ def get_temp_export_dir(timestamped_export_dir): """ (dirname, basename) = os.path.split(timestamped_export_dir) temp_export_dir = os.path.join( - compat.as_bytes(dirname), compat.as_bytes('temp-{}'.format(basename))) + compat.as_bytes(dirname), + compat.as_bytes('temp-{}'.format(compat.as_text(basename)))) return temp_export_dir diff --git a/tensorflow/contrib/linalg/BUILD b/tensorflow/contrib/linalg/BUILD index fb6a989b76db97..6c18a0712531e4 100644 --- a/tensorflow/contrib/linalg/BUILD +++ b/tensorflow/contrib/linalg/BUILD @@ -42,47 +42,3 @@ gpu_py_test( "//tensorflow/python:platform_test", ], ) - -gpu_py_test( - name = "linear_operator_block_diag_test", - size = "medium", - srcs = ["python/kernel_tests/linear_operator_block_diag_test.py"], - additional_deps = [ - ":linalg_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - ], - shard_count = 5, - tags = [ - "noasan", - "optonly", - ], -) - -gpu_py_test( - name = "linear_operator_kronecker_test", - size = "medium", - srcs = ["python/kernel_tests/linear_operator_kronecker_test.py"], - additional_deps = [ - ":linalg_py", - "//third_party/py/numpy", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_test_lib", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - ], - shard_count = 8, - tags = [ - "noasan", - "optonly", - ], -) diff --git a/tensorflow/contrib/linalg/__init__.py b/tensorflow/contrib/linalg/__init__.py index 554854da84715e..a262a099cf8f84 100644 --- a/tensorflow/contrib/linalg/__init__.py +++ b/tensorflow/contrib/linalg/__init__.py @@ -39,14 +39,14 @@ # pylint: disable=unused-import,wildcard-import,line-too-long,g-importing-member from tensorflow.contrib.linalg.python.ops.linear_operator_addition import * -from tensorflow.contrib.linalg.python.ops.linear_operator_block_diag import * -from tensorflow.contrib.linalg.python.ops.linear_operator_kronecker import * from 
tensorflow.python.ops.linalg.linear_operator import * +from tensorflow.python.ops.linalg.linear_operator_block_diag import * from tensorflow.python.ops.linalg.linear_operator_circulant import * from tensorflow.python.ops.linalg.linear_operator_composition import * from tensorflow.python.ops.linalg.linear_operator_diag import * from tensorflow.python.ops.linalg.linear_operator_full_matrix import * from tensorflow.python.ops.linalg.linear_operator_identity import * +from tensorflow.python.ops.linalg.linear_operator_kronecker import * from tensorflow.python.ops.linalg.linear_operator_low_rank_update import * from tensorflow.python.ops.linalg.linear_operator_lower_triangular import * diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py index b5741967ab5256..ef0e08a777779e 100644 --- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py +++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py @@ -35,6 +35,8 @@ from tensorflow.python.ops import gen_sdca_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import parsing_ops +from tensorflow.python.ops import partitioned_variables +from tensorflow.python.ops import variable_scope from tensorflow.python.ops import variables as variables_lib from tensorflow.python.platform import googletest @@ -132,15 +134,22 @@ def make_random_examples_and_variables_dicts(num_examples, dim, num_non_zero): return examples_dict, variables_dict -def make_variable_dict(max_age, max_gender): +def make_variable_dict(max_age, max_gender, partitioned=False): # TODO(sibyl-toe9oF2e): Figure out how to derive max_age & max_gender from # examples_dict. - age_weights = variables_lib.Variable( - array_ops.zeros( - [max_age + 1], dtype=dtypes.float32)) - gender_weights = variables_lib.Variable( - array_ops.zeros( - [max_gender + 1], dtype=dtypes.float32)) + partitioner = None + if partitioned: + partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2, + axis=0) + with variable_scope.variable_scope( + name_or_scope='variables', + partitioner=partitioner): + age_weights = variables_lib.Variable( + array_ops.zeros( + [max_age + 1], dtype=dtypes.float32)) + gender_weights = variables_lib.Variable( + array_ops.zeros( + [max_gender + 1], dtype=dtypes.float32)) return dict( sparse_features_weights=[age_weights, gender_weights], dense_features_weights=[]) @@ -265,6 +274,54 @@ def testSimple(self): self.assertAllClose( 0.01, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2) + def testPartitionedPrimals(self): + # Setup test data + example_protos = [ + make_example_proto({ + 'age': [0], + 'gender': [0] + }, 0), + make_example_proto({ + 'age': [1], + 'gender': [1] + }, 1), + ] + example_weights = [1.0, 1.0] + for num_shards in _SHARD_NUMBERS: + with self._single_threaded_test_session(): + examples = make_example_dict(example_protos, example_weights) + variables = make_variable_dict(1, 1, partitioned=True) + options = dict( + symmetric_l2_regularization=1, + symmetric_l1_regularization=0, + num_table_shards=num_shards, + loss_type='logistic_loss') + + lr = SdcaModel(examples, variables, options) + variables_lib.global_variables_initializer().run() + unregularized_loss = lr.unregularized_loss(examples) + loss = lr.regularized_loss(examples) + predictions = lr.predictions(examples) + self.assertAllClose(0.693147, unregularized_loss.eval()) + self.assertAllClose(0.693147, loss.eval()) + train_op = 
lr.minimize() + for _ in range(_MAX_ITERATIONS): + train_op.run() + lr.update_weights(train_op).run() + # The high tolerance in unregularized_loss comparisons is due to the + # fact that it's possible to trade off unregularized_loss vs. + # regularization and still have a sum that is quite close to the + # optimal regularized_loss value. SDCA's duality gap only ensures that + # the regularized_loss is within 0.01 of optimal. + # 0.525457 is the optimal regularized_loss. + # 0.411608 is the unregularized_loss at that optimum. + self.assertAllClose(0.411608, unregularized_loss.eval(), atol=0.05) + self.assertAllClose(0.525457, loss.eval(), atol=0.01) + predicted_labels = get_binary_predictions_for_logistic(predictions) + self.assertAllEqual([0, 1], predicted_labels.eval()) + self.assertAllClose( + 0.01, lr.approximate_duality_gap().eval(), rtol=1e-2, atol=1e-2) + def testSparseRandom(self): dim = 20 num_examples = 1000 @@ -320,7 +377,10 @@ def testSparseDuplicate(self): train_op.run() def testDistributedSimple(self): - # Setup test data + # Distributed SDCA may not converge if the workers update concurrently the + # same example. In this test the examples are partitioned across workers. + # The examples are the same for all workers, just the example_ids are + # different. example_protos = [ make_example_proto({ 'age': [0], @@ -332,13 +392,19 @@ def testDistributedSimple(self): }, 1), ] example_weights = [1.0, 1.0] + examples = make_example_dict(example_protos, example_weights) + example_ids = array_ops.placeholder( + dtypes.string, shape=(len(example_weights),)) + examples['example_ids'] = example_ids + variables = make_variable_dict(1, 1) for num_shards in _SHARD_NUMBERS: for num_loss_partitions in _NUM_LOSS_PARTITIONS: with self._single_threaded_test_session(): - examples = make_example_dict(example_protos, example_weights) - variables = make_variable_dict(1, 1) options = dict( - symmetric_l2_regularization=1, + # Keep the same solution as for TestSimple: since the number of + # examples is multplied by num_loss_partitions, multiply also + # L2 by the same value. + symmetric_l2_regularization=num_loss_partitions, symmetric_l1_regularization=0, loss_type='logistic_loss', num_table_shards=num_shards, @@ -354,32 +420,30 @@ def testDistributedSimple(self): train_op = lr.minimize() - def minimize(): + def minimize(worker_id): with self._single_threaded_test_session(): + feed_dict = {example_ids: [ + str(i + worker_id*len(example_weights)) for i in range( + len(example_weights))]} for _ in range(_MAX_ITERATIONS): - train_op.run() # pylint: disable=cell-var-from-loop + train_op.run(feed_dict=feed_dict) # pylint: disable=cell-var-from-loop threads = [] - for _ in range(num_loss_partitions): - threads.append(threading.Thread(target=minimize)) + for worker_id in range(num_loss_partitions): + threads.append(threading.Thread(target=minimize, args=(worker_id,))) threads[-1].start() for t in threads: t.join() - lr.update_weights(train_op).run() - - # The high tolerance in unregularized_loss comparisons is due to the - # fact that it's possible to trade off unregularized_loss vs. - # regularization and still have a sum that is quite close to the - # optimal regularized_loss value. SDCA's duality gap only ensures - # that the regularized_loss is within 0.01 of optimal. - # 0.525457 is the optimal regularized_loss. - # 0.411608 is the unregularized_loss at that optimum. 
- self.assertAllClose(0.411608, unregularized_loss.eval(), atol=0.05) - self.assertAllClose(0.525457, loss.eval(), atol=0.01) + lr.update_weights(train_op).run(feed_dict={ + example_ids: [str(i) for i in range(len(example_weights))]}) + + # Test only the unregularized loss because the optimal value of the + # regularized loss depends on num_loss_partitions. + self.assertAllClose(0.411608, unregularized_loss.eval(), atol=0.02) predicted_labels = get_binary_predictions_for_logistic(predictions) self.assertAllEqual([0, 1], predicted_labels.eval()) - self.assertTrue(lr.approximate_duality_gap().eval() < 0.02) + self.assertNear(0.0, lr.approximate_duality_gap().eval(), 0.02) def testSimpleNoL2(self): # Same as test above (so comments from above apply) but without an L2. diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py index f980746a19fb8e..0047d5753a773c 100644 --- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py +++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py @@ -22,12 +22,14 @@ from six.moves import range from tensorflow.contrib.linear_optimizer.python.ops.sharded_mutable_dense_hashtable import ShardedMutableDenseHashTable +from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops from tensorflow.python.framework.ops import internal_convert_to_tensor from tensorflow.python.framework.ops import name_scope from tensorflow.python.ops import array_ops from tensorflow.python.ops import control_flow_ops +from tensorflow.python.ops import data_flow_ops from tensorflow.python.ops import gen_sdca_ops from tensorflow.python.ops import math_ops from tensorflow.python.ops import nn_ops @@ -43,9 +45,6 @@ class SdcaModel(object): """Stochastic dual coordinate ascent solver for linear models. - This class currently only supports a single machine (multi-threaded) - implementation. We expect the weights and duals to fit in a single machine. - Loss functions supported: * Binary logistic loss @@ -182,18 +181,41 @@ def _num_table_shards(self): # TODO(sibyl-Aix6ihai): Use optimizer interface to make use of slot creation logic. def _create_slots(self): - # Make internal variables which have the updates before applying L1 - # regularization. + """Make unshrinked internal variables (slots).""" + # Unshrinked variables have the updates before applying L1 regularization. + # Each unshrinked slot variable is either a `Variable` or list of + # `Variable`, depending on the value of its corresponding primary variable. + # We avoid using `PartitionedVariable` for the unshrinked slots since we do + # not need any of the extra information. self._slots = collections.defaultdict(list) for name in ['sparse_features_weights', 'dense_features_weights']: for var in self._variables[name]: - with ops.device(var.device): - # TODO(andreasst): remove SDCAOptimizer suffix once bug 30843109 is - # fixed - self._slots['unshrinked_' + name].append( - var_ops.Variable( - array_ops.zeros_like(var.initialized_value(), dtypes.float32), - name=var.op.name + '_unshrinked/SDCAOptimizer')) + # Our primary variable may be either a PartitionedVariable, or a list + # of Variables (each representing a partition). 
+ if (isinstance(var, var_ops.PartitionedVariable) or + isinstance(var, list)): + var_list = [] + # pylint: disable=protected-access + for v in var: + with ops.colocate_with(v): + # TODO(andreasst): remove SDCAOptimizer suffix once bug 30843109 + # is fixed. + slot_var = var_ops.Variable( + initial_value=array_ops.zeros_like(v.initialized_value(), + dtypes.float32), + name=v.op.name + '_unshrinked/SDCAOptimizer') + var_list.append(slot_var) + self._slots['unshrinked_' + name].append(var_list) + # pylint: enable=protected-access + else: + with ops.device(var.device): + # TODO(andreasst): remove SDCAOptimizer suffix once bug 30843109 is + # fixed. + self._slots['unshrinked_' + name].append( + var_ops.Variable( + array_ops.zeros_like(var.initialized_value(), + dtypes.float32), + name=var.op.name + '_unshrinked/SDCAOptimizer')) def _assertSpecified(self, items, check_in): for x in items: @@ -205,16 +227,25 @@ def _assertList(self, items, check_in): if not isinstance(check_in[x], list): raise ValueError(x + ' must be a list.') + def _var_to_list(self, var): + """Wraps var in a list if it is not a list or PartitionedVariable.""" + if not (isinstance(var, list) or + isinstance(var, var_ops.PartitionedVariable)): + var = [var] + return var + def _l1_loss(self): """Computes the (un-normalized) l1 loss of the model.""" with name_scope('sdca/l1_loss'): sums = [] for name in ['sparse_features_weights', 'dense_features_weights']: - for weights in self._convert_n_to_tensor(self._variables[name]): - with ops.device(weights.device): - sums.append( - math_ops.reduce_sum( - math_ops.abs(math_ops.cast(weights, dtypes.float64)))) + for var in self._variables[name]: + for v in self._var_to_list(var): + weights = internal_convert_to_tensor(v) + with ops.device(weights.device): + sums.append( + math_ops.reduce_sum( + math_ops.abs(math_ops.cast(weights, dtypes.float64)))) # SDCA L1 regularization cost is: l1 * sum(|weights|) return self._options['symmetric_l1_regularization'] * math_ops.add_n(sums) @@ -223,17 +254,37 @@ def _l2_loss(self, l2): with name_scope('sdca/l2_loss'): sums = [] for name in ['sparse_features_weights', 'dense_features_weights']: - for weights in self._convert_n_to_tensor(self._variables[name]): - with ops.device(weights.device): - sums.append( - math_ops.reduce_sum( - math_ops.square(math_ops.cast(weights, dtypes.float64)))) + for var in self._variables[name]: + for v in self._var_to_list(var): + weights = internal_convert_to_tensor(v) + with ops.device(weights.device): + sums.append(math_ops.reduce_sum(math_ops.square(math_ops.cast( + weights, dtypes.float64)))) # SDCA L2 regularization cost is: l2 * sum(weights^2) / 2 return l2 * math_ops.add_n(sums) / 2.0 def _convert_n_to_tensor(self, input_list, as_ref=False): """Converts input list to a set of tensors.""" - return [internal_convert_to_tensor(x, as_ref=as_ref) for x in input_list] + # input_list can be a list of Variables (that are implicitly partitioned), + # in which case the underlying logic in internal_convert_to_tensor will not + # concatenate the partitions together. This method takes care of the + # concatenating (we only allow partitioning on the first axis). + output_list = [] + for x in input_list: + tensor_to_convert = x + if isinstance(x, list) or isinstance(x, var_ops.PartitionedVariable): + # We only allow for partitioning on the first axis. 
+ tensor_to_convert = array_ops.concat(x, axis=0) + output_list.append(internal_convert_to_tensor( + tensor_to_convert, as_ref=as_ref)) + return output_list + + def _get_first_dimension_size_statically(self, w, num_partitions): + """Compute the static size of the first dimension for a sharded variable.""" + dim_0_size = w[0].get_shape()[0] + for p in range(1, num_partitions): + dim_0_size += w[p].get_shape()[0] + return dim_0_size def _linear_predictions(self, examples): """Returns predictions of the form w*x.""" @@ -286,6 +337,28 @@ def predictions(self, examples): result = math_ops.sigmoid(result) return result + def _get_partitioned_update_ops(self, + v_num, + num_partitions_by_var, + p_assignments_by_var, + gather_ids_by_var, + weights, + full_update, + p_assignments, + num_partitions): + """Get updates for partitioned variables.""" + num_partitions = num_partitions_by_var[v_num] + p_assignments = p_assignments_by_var[v_num] + gather_ids = gather_ids_by_var[v_num] + updates = data_flow_ops.dynamic_partition( + full_update, p_assignments, num_partitions) + update_ops = [] + for p in range(num_partitions): + with ops.colocate_with(weights[p]): + result = state_ops.scatter_add(weights[p], gather_ids[p], updates[p]) + update_ops.append(result) + return update_ops + def minimize(self, global_step=None, name=None): """Add operations to train a linear model by minimizing the loss function. @@ -318,18 +391,89 @@ def minimize(self, global_step=None, name=None): # Solver returns example_state_update, new delta sparse_feature_weights # and delta dense_feature_weights. - weights_tensor = self._convert_n_to_tensor(self._slots[ - 'unshrinked_sparse_features_weights']) sparse_weights = [] sparse_indices = [] - for w, i in zip(weights_tensor, sparse_feature_indices): - # Find the feature ids to lookup in the variables. - with ops.device(w.device): - sparse_indices.append( - math_ops.cast( - array_ops.unique(math_ops.cast(i, dtypes.int32))[0], - dtypes.int64)) - sparse_weights.append(array_ops.gather(w, sparse_indices[-1])) + # If we have partitioned variables, keep a few lists of Tensors around + # that we need for the assign_add after the op call to + # gen_sdca_ops.sdca_optimizer(). + num_partitions_by_var = [] + p_assignments_by_var = [] + gather_ids_by_var = [] + for w, i in zip(self._slots['unshrinked_sparse_features_weights'], + sparse_feature_indices): + # Append the sparse_indices (in full-variable space). + sparse_idx = math_ops.cast( + array_ops.unique(math_ops.cast(i, dtypes.int32))[0], + dtypes.int64) + sparse_indices.append(sparse_idx) + if isinstance(w, list) or isinstance(w, var_ops.PartitionedVariable): + num_partitions = len(w) + flat_ids = array_ops.reshape(sparse_idx, [-1]) + # We use div partitioning, which is easiest to support downstream. + # Compute num_total_ids as the sum of dim-0 of w, then assign + # to partitions based on a constant number of ids per partition. + # Optimize if we already know the full shape statically. 
+ dim_0_size = self._get_first_dimension_size_statically( + w, num_partitions) + + if dim_0_size.value: + num_total_ids = constant_op.constant(dim_0_size.value, + flat_ids.dtype) + else: + dim_0_sizes = [] + for p in range(num_partitions): + if w[p].get_shape()[0].value is not None: + dim_0_sizes.append(w[p].get_shape()[0].value) + else: + with ops.colocate_with(w[p]): + dim_0_sizes.append(array_ops.shape(w[p])[0]) + num_total_ids = math_ops.reduce_sum( + math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype)) + ids_per_partition = num_total_ids // num_partitions + extras = num_total_ids % num_partitions + + p_assignments = math_ops.maximum( + flat_ids // (ids_per_partition + 1), + (flat_ids - extras) // ids_per_partition) + + # Emulate a conditional using a boolean indicator tensor + new_ids = array_ops.where(p_assignments < extras, + flat_ids % (ids_per_partition + 1), + (flat_ids - extras) % ids_per_partition) + + # Cast partition assignments to int32 for use in dynamic_partition. + # There really should not be more than 2^32 partitions. + p_assignments = math_ops.cast(p_assignments, dtypes.int32) + # Partition list of ids based on assignments into num_partitions + # separate lists. + gather_ids = data_flow_ops.dynamic_partition(new_ids, + p_assignments, + num_partitions) + # Append these to the lists for use in the later update. + num_partitions_by_var.append(num_partitions) + p_assignments_by_var.append(p_assignments) + gather_ids_by_var.append(gather_ids) + + # Gather the weights from each partition. + partition_gathered_weights = [] + for p in range(num_partitions): + with ops.colocate_with(w[p]): + partition_gathered_weights.append( + array_ops.gather(w[p], gather_ids[p])) + + # Stitch the weights back together in the same order they were before + # we dynamic_partitioned them. + condition_indices = data_flow_ops.dynamic_partition( + math_ops.range(array_ops.shape(new_ids)[0]), + p_assignments, num_partitions) + batch_gathered_weights = data_flow_ops.dynamic_stitch( + condition_indices, partition_gathered_weights) + else: + w_as_tensor = internal_convert_to_tensor(w) + with ops.device(w_as_tensor.device): + batch_gathered_weights = array_ops.gather( + w_as_tensor, sparse_idx) + sparse_weights.append(batch_gathered_weights) # pylint: disable=protected-access esu, sfw, dfw = gen_sdca_ops.sdca_optimizer( @@ -355,12 +499,25 @@ def minimize(self, global_step=None, name=None): with ops.control_dependencies([esu]): update_ops = [self._hashtable.insert(example_ids_hashed, esu)] # Update the weights before the proximal step. 
- for w, i, u in zip(self._slots['unshrinked_sparse_features_weights'], - sparse_indices, sfw): - update_ops.append(state_ops.scatter_add(w, i, u)) + for v_num, (w, i, u) in enumerate( + zip(self._slots['unshrinked_sparse_features_weights'], + sparse_indices, sfw)): + if (isinstance(w, var_ops.PartitionedVariable) or + isinstance(w, list)): + update_ops += self._get_partitioned_update_ops( + v_num, num_partitions_by_var, p_assignments_by_var, + gather_ids_by_var, w, u, p_assignments, num_partitions) + else: + update_ops.append(state_ops.scatter_add(w, i, u)) for w, u in zip(self._slots['unshrinked_dense_features_weights'], dfw): - update_ops.append(w.assign_add(u)) - + if (isinstance(w, var_ops.PartitionedVariable) or + isinstance(w, list)): + split_updates = array_ops.split( + u, num_or_size_splits=[v.shape.as_list()[0] for v in w]) + for v, split_update in zip(w, split_updates): + update_ops.append(state_ops.assign_add(v, split_update)) + else: + update_ops.append(state_ops.assign_add(w, u)) if not global_step: return control_flow_ops.group(*update_ops) with ops.control_dependencies(update_ops): @@ -385,21 +542,22 @@ def update_weights(self, train_op): for name in ['sparse_features_weights', 'dense_features_weights']: for var, slot_var in zip(self._variables[name], self._slots['unshrinked_' + name]): - update_ops.append(var.assign(slot_var)) + for v, sv in zip(self._var_to_list(var), self._var_to_list(slot_var)): + update_ops.append(v.assign(sv)) # Apply proximal step. with ops.control_dependencies(update_ops): update_ops = [] for name in ['sparse_features_weights', 'dense_features_weights']: for var in self._variables[name]: - with ops.device(var.device): - # pylint: disable=protected-access - update_ops.append( - gen_sdca_ops.sdca_shrink_l1( - self._convert_n_to_tensor( - [var], as_ref=True), - l1=self._symmetric_l1_regularization(), - l2=self._symmetric_l2_regularization())) + for v in self._var_to_list(var): + with ops.device(v.device): + # pylint: disable=protected-access + update_ops.append( + gen_sdca_ops.sdca_shrink_l1( + self._convert_n_to_tensor([v], as_ref=True), + l1=self._symmetric_l1_regularization(), + l2=self._symmetric_l2_regularization())) return control_flow_ops.group(*update_ops) def approximate_duality_gap(self): diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py index d4e54c82f988e0..200e7de6b95f17 100644 --- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py +++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator.py @@ -116,6 +116,7 @@ def sdca_model_fn(features, labels, mode, params, config=None): num_loss_partitions = params["num_loss_partitions"] weight_column_name = params["weight_column_name"] update_weights_hook = params.get("update_weights_hook", None) + partitioner = params["partitioner"] loss_type = None if isinstance(head, head_lib._BinarySvmHead): # pylint: disable=protected-access @@ -136,12 +137,14 @@ def sdca_model_fn(features, labels, mode, params, config=None): example_id_column=example_id_column, num_loss_partitions=n_loss_partitions, symmetric_l1_regularization=l1_regularization, - symmetric_l2_regularization=l2_regularization) + symmetric_l2_regularization=l2_regularization, + partitioner=partitioner) parent_scope = "linear" with variable_scope.variable_scope( - values=features.values(), name_or_scope=parent_scope) as scope: + values=features.values(), name_or_scope=parent_scope, + partitioner=partitioner) as scope: features = features.copy() 
features.update(layers.transform_features(features, feature_columns)) logits, columns_to_variables, bias = ( @@ -213,7 +216,8 @@ def __init__(self, l2_regularization=1.0, num_loss_partitions=None, config=None, - feature_engineering_fn=None): + feature_engineering_fn=None, + partitioner=None): """Construct a `_SDCAEstimator` estimator object. Args: @@ -241,6 +245,8 @@ def __init__(self, feature_engineering_fn: Feature engineering function. Takes features and labels which are the output of `input_fn` and returns features and labels which will be fed into the model. + partitioner: Variable partitioner for the primal weights (`div` + partitioning strategy will be used). Returns: A `_SDCAEstimator` estimator. @@ -267,6 +273,7 @@ def __init__(self, "l2_regularization": l2_regularization, "weight_column_name": weight_column_name, "update_weights_hook": _SdcaUpdateWeightsHook(), + "partitioner": partitioner, } super(_SDCAEstimator, self).__init__( @@ -336,7 +343,8 @@ def __init__(self, l2_regularization=1.0, num_loss_partitions=None, config=None, - feature_engineering_fn=None): + feature_engineering_fn=None, + partitioner=None): """Construct a `SDCALogisticClassifier` object. Args: @@ -361,6 +369,8 @@ def __init__(self, feature_engineering_fn: Feature engineering function. Takes features and labels which are the output of `input_fn` and returns features and labels which will be fed into the model. + partitioner: Variable partitioner for the primal weights (`div` + partitioning strategy will be used). Returns: A `SDCALogisiticClassifier` estimator. @@ -376,7 +386,8 @@ def __init__(self, l2_regularization=l2_regularization, num_loss_partitions=num_loss_partitions, config=config, - feature_engineering_fn=None) + feature_engineering_fn=None, + partitioner=partitioner) def predict_classes(self, input_fn=None): """Runs inference to determine the predicted class. @@ -463,7 +474,8 @@ def __init__(self, l2_regularization=1.0, num_loss_partitions=None, config=None, - feature_engineering_fn=None): + feature_engineering_fn=None, + partitioner=None): """Construct a `SDCALinearRegressor` estimator object. @@ -489,6 +501,8 @@ def __init__(self, feature_engineering_fn: Feature engineering function. Takes features and labels which are the output of `input_fn` and returns features and labels which will be fed into the model. + partitioner: Variable partitioner for the primal weights (`div` + partitioning strategy will be used). Returns: A `SDCALinearRegressor` estimator. @@ -503,7 +517,8 @@ def __init__(self, l2_regularization=l2_regularization, num_loss_partitions=num_loss_partitions, config=config, - feature_engineering_fn=None) + feature_engineering_fn=None, + partitioner=partitioner) def predict_scores(self, input_fn): """Returns predicted scores for given features. 
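The new `partitioner` argument is threaded from the estimator constructors down to `SDCAOptimizer` and the `linear` variable scope above. A rough usage sketch follows, mirroring the partitioned tests in the next file; `feature_column_lib`, `sdca_estimator`, and `input_fn` are assumed to be defined as in `sdca_estimator_test.py`, and the shard count of 2 is illustrative only:

```python
from tensorflow.python.ops import partitioned_variables

# Feature columns as in testPartitionedMixedFeatures below.
price = feature_column_lib.real_valued_column('price')
country = feature_column_lib.sparse_column_with_hash_bucket(
    'country', hash_bucket_size=5)

classifier = sdca_estimator.SDCALogisticClassifier(
    example_id_column='example_id',
    feature_columns=[price, country],
    # Split the primal weight variables into two shards along axis 0; the
    # SDCA weight lookups assume the `div` partitioning strategy.
    partitioner=partitioned_variables.fixed_size_partitioner(
        num_shards=2, axis=0))

classifier.fit(input_fn=input_fn, steps=50)
metrics = classifier.evaluate(input_fn=input_fn, steps=1)
```

Without a partitioner the behavior is unchanged: each primal weight remains a single `Variable`, as in the non-partitioned code paths of `sdca_ops.py` above.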
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py index bed3d5139fcbf9..647667188238dc 100644 --- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py +++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py @@ -25,6 +25,7 @@ from tensorflow.core.protobuf import config_pb2 from tensorflow.python.framework import constant_op from tensorflow.python.framework import sparse_tensor +from tensorflow.python.ops import partitioned_variables from tensorflow.python.platform import test @@ -273,6 +274,47 @@ def input_fn(): metrics = classifier.evaluate(input_fn=input_fn, steps=1) self.assertGreater(metrics['accuracy'], 0.9) + def testPartitionedMixedFeatures(self): + """Tests SDCALogisticClassifier with a mix of features (partitioned).""" + + def input_fn(): + return { + 'example_id': + constant_op.constant(['1', '2', '3']), + 'price': + constant_op.constant([[0.6], [0.8], [0.3]]), + 'sq_footage': + constant_op.constant([900.0, 700.0, 600.0]), + 'country': + sparse_tensor.SparseTensor( + values=['IT', 'US', 'GB'], + indices=[[0, 0], [1, 3], [2, 1]], + dense_shape=[3, 5]), + 'weights': + constant_op.constant([[3.0], [1.0], [1.0]]) + }, constant_op.constant([[1], [0], [1]]) + + with self._single_threaded_test_session(): + price = feature_column_lib.real_valued_column('price') + sq_footage_bucket = feature_column_lib.bucketized_column( + feature_column_lib.real_valued_column('sq_footage'), + boundaries=[650.0, 800.0]) + country = feature_column_lib.sparse_column_with_hash_bucket( + 'country', hash_bucket_size=5) + sq_footage_country = feature_column_lib.crossed_column( + [sq_footage_bucket, country], hash_bucket_size=10) + classifier = sdca_estimator.SDCALogisticClassifier( + example_id_column='example_id', + feature_columns=[ + price, sq_footage_bucket, country, sq_footage_country + ], + weight_column_name='weights', + partitioner=partitioned_variables.fixed_size_partitioner( + num_shards=2, axis=0)) + classifier.fit(input_fn=input_fn, steps=50) + metrics = classifier.evaluate(input_fn=input_fn, steps=1) + self.assertGreater(metrics['accuracy'], 0.9) + class SDCALinearRegressorTest(test.TestCase): @@ -350,6 +392,48 @@ def input_fn(): loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] self.assertLess(loss, 0.05) + def testMixedFeaturesArbitraryWeightsPartitioned(self): + """Tests SDCALinearRegressor works with a mix of features (partitioned).""" + + def input_fn(): + return { + 'example_id': + constant_op.constant(['1', '2', '3']), + 'price': + constant_op.constant([[0.6], [0.8], [0.3]]), + 'sq_footage': + constant_op.constant([[900.0], [700.0], [600.0]]), + 'country': + sparse_tensor.SparseTensor( + values=['IT', 'US', 'GB'], + indices=[[0, 0], [1, 3], [2, 1]], + dense_shape=[3, 5]), + 'weights': + constant_op.constant([[3.0], [5.0], [7.0]]) + }, constant_op.constant([[1.55], [-1.25], [-3.0]]) + + with self._single_threaded_test_session(): + price = feature_column_lib.real_valued_column('price') + sq_footage_bucket = feature_column_lib.bucketized_column( + feature_column_lib.real_valued_column('sq_footage'), + boundaries=[650.0, 800.0]) + country = feature_column_lib.sparse_column_with_hash_bucket( + 'country', hash_bucket_size=5) + sq_footage_country = feature_column_lib.crossed_column( + [sq_footage_bucket, country], hash_bucket_size=10) + regressor = sdca_estimator.SDCALinearRegressor( + example_id_column='example_id', + feature_columns=[ + price, 
sq_footage_bucket, country, sq_footage_country + ], + l2_regularization=1.0, + weight_column_name='weights', + partitioner=partitioned_variables.fixed_size_partitioner( + num_shards=2, axis=0)) + regressor.fit(input_fn=input_fn, steps=20) + loss = regressor.evaluate(input_fn=input_fn, steps=1)['loss'] + self.assertLess(loss, 0.05) + def testSdcaOptimizerSparseFeaturesWithL1Reg(self): """SDCALinearRegressor works with sparse features and L1 regularization.""" diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py index 5d4572bf6c761e..9872c6f97c879d 100644 --- a/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py +++ b/tensorflow/contrib/linear_optimizer/python/sdca_optimizer.py @@ -37,18 +37,18 @@ class SDCAOptimizer(object): Example usage: ```python - real_feature_column = real_valued_column(...) - sparse_feature_column = sparse_column_with_hash_bucket(...) - sdca_optimizer = linear.SDCAOptimizer(example_id_column='example_id', - num_loss_partitions=1, - num_table_shards=1, - symmetric_l2_regularization=2.0) - classifier = tf.contrib.learn.LinearClassifier( - feature_columns=[real_feature_column, sparse_feature_column], - weight_column_name=..., - optimizer=sdca_optimizer) - classifier.fit(input_fn_train, steps=50) - classifier.evaluate(input_fn=input_fn_eval) + real_feature_column = real_valued_column(...) + sparse_feature_column = sparse_column_with_hash_bucket(...) + sdca_optimizer = linear.SDCAOptimizer(example_id_column='example_id', + num_loss_partitions=1, + num_table_shards=1, + symmetric_l2_regularization=2.0) + classifier = tf.contrib.learn.LinearClassifier( + feature_columns=[real_feature_column, sparse_feature_column], + weight_column_name=..., + optimizer=sdca_optimizer) + classifier.fit(input_fn_train, steps=50) + classifier.evaluate(input_fn=input_fn_eval) ``` Here the expectation is that the `input_fn_*` functions passed to train and @@ -64,7 +64,8 @@ class SDCAOptimizer(object): of workers running the train steps. It defaults to 1 (single machine). `num_table_shards` defines the number of shards for the internal state table, typically set to match the number of parameter servers for large - data sets. + data sets. You can also specify a `partitioner` object to partition the primal + weights during training (`div` partitioning strategy will be used). 
""" def __init__(self, @@ -73,13 +74,15 @@ def __init__(self, num_table_shards=None, symmetric_l1_regularization=0.0, symmetric_l2_regularization=1.0, - adaptive=True): + adaptive=True, + partitioner=None): self._example_id_column = example_id_column self._num_loss_partitions = num_loss_partitions self._num_table_shards = num_table_shards self._symmetric_l1_regularization = symmetric_l1_regularization self._symmetric_l2_regularization = symmetric_l2_regularization self._adaptive = adaptive + self._partitioner = partitioner def get_name(self): return 'SDCAOptimizer' @@ -108,6 +111,10 @@ def symmetric_l2_regularization(self): def adaptive(self): return self._adaptive + @property + def partitioner(self): + return self._partitioner + def get_train_step(self, columns_to_variables, weight_column_name, loss_type, features, targets, global_step): """Returns the training operation of an SdcaModel optimizer.""" @@ -175,10 +182,12 @@ def _training_examples_and_variables(): sparse_feature_column = _dense_tensor_to_sparse_feature_column( dense_bucket_tensor) sparse_feature_with_values.append(sparse_feature_column) - # For bucketized columns, the variables list contains exactly one - # element. - sparse_feature_with_values_weights.append( - columns_to_variables[column][0]) + # If a partitioner was used during variable creation, we will have a + # list of Variables here larger than 1. + vars_to_append = columns_to_variables[column][0] + if len(columns_to_variables[column]) > 1: + vars_to_append = columns_to_variables[column] + sparse_feature_with_values_weights.append(vars_to_append) elif isinstance( column, ( @@ -198,6 +207,14 @@ def _training_examples_and_variables(): example_ids = array_ops.reshape(id_tensor.indices[:, 0], [-1]) flat_ids = array_ops.reshape(id_tensor.values, [-1]) + # Prune invalid IDs (< 0) from the flat_ids, example_ids, and + # weight_tensor. These can come from looking up an OOV entry in the + # vocabulary (default value being -1). + is_id_valid = math_ops.greater_equal(flat_ids, 0) + flat_ids = array_ops.boolean_mask(flat_ids, is_id_valid) + example_ids = array_ops.boolean_mask(example_ids, is_id_valid) + weight_tensor = array_ops.boolean_mask(weight_tensor, is_id_valid) + projection_length = math_ops.reduce_max(flat_ids) + 1 # project ids based on example ids so that we can dedup ids that # occur multiple times for a single example. @@ -218,8 +235,12 @@ def _training_examples_and_variables(): array_ops.shape(ids)[0]), [-1]) sparse_feature_with_values.append( SparseFeatureColumn(example_ids_filtered, reproject_ids, weights)) - sparse_feature_with_values_weights.append( - columns_to_variables[column][0]) + # If a partitioner was used during variable creation, we will have a + # list of Variables here larger than 1. + vars_to_append = columns_to_variables[column][0] + if len(columns_to_variables[column]) > 1: + vars_to_append = columns_to_variables[column] + sparse_feature_with_values_weights.append(vars_to_append) else: raise ValueError('SDCAOptimizer does not support column type %s.' 
% type(column).__name__) diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD index 1534f97d760015..9c804d27854b80 100644 --- a/tensorflow/contrib/lite/BUILD +++ b/tensorflow/contrib/lite/BUILD @@ -6,8 +6,6 @@ licenses(["notice"]) # Apache 2.0 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts", "gen_selected_ops") -exports_files(["LICENSE"]) - exports_files(glob([ "testdata/*.bin", "testdata/*.pb", @@ -92,6 +90,18 @@ cc_library( deps = [":context"], ) +cc_library( + name = "kernel_api", + hdrs = [ + "builtin_op_data.h", + "builtin_ops.h", + "context.h", + "context_util.h", + ], +) + +exports_files(["builtin_ops.h"]) + cc_library( name = "string", hdrs = [ @@ -112,6 +122,7 @@ cc_library( "interpreter.cc", "model.cc", "nnapi_delegate.cc", + "op_resolver.cc", "optional_debug_tools.cc", ], hdrs = [ @@ -122,6 +133,7 @@ cc_library( "interpreter.h", "model.h", "nnapi_delegate.h", + "op_resolver.h", "optional_debug_tools.h", ], copts = tflite_copts(), @@ -224,6 +236,18 @@ cc_test( ], ) +# Test OpResolver. +cc_test( + name = "op_resolver_test", + size = "small", + srcs = ["op_resolver_test.cc"], + deps = [ + ":framework", + "//tensorflow/contrib/lite/testing:util", + "@com_google_googletest//:gtest", + ], +) + # Test the C extension API code. cc_test( name = "context_test", diff --git a/tensorflow/contrib/lite/Makefile b/tensorflow/contrib/lite/Makefile index 65fba52d461461..cc8a8035d1dade 100644 --- a/tensorflow/contrib/lite/Makefile +++ b/tensorflow/contrib/lite/Makefile @@ -1,4 +1,3 @@ - # Find where we're running from, so we can store generated files here. ifeq ($(origin MAKEFILE_DIR), undefined) MAKEFILE_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST)))) @@ -30,7 +29,7 @@ GENDIR := $(MAKEFILE_DIR)/gen/obj/ CXX := $(CC_PREFIX)gcc CXXFLAGS := --std=c++11 -O3 -DNDEBUG CC := $(CC_PREFIX)gcc -CFLAGS := -O3 -DNDEBUG +CCFLAGS := -O3 -DNDEBUG LDOPTS := LDOPTS += -L/usr/local/lib ARFLAGS := -r @@ -69,12 +68,12 @@ LIB_NAME := libtensorflow-lite.a LIB_PATH := $(LIBDIR)$(LIB_NAME) # A small example program that shows how to link against the library. -BENCHMARK_PATH := $(BINDIR)benchmark_model +MINIMAL_PATH := $(BINDIR)minimal -BENCHMARK_SRCS := \ -tensorflow/contrib/lite/tools/benchmark_model.cc -BENCHMARK_OBJS := $(addprefix $(OBJDIR), \ -$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(BENCHMARK_SRCS)))) +MINIMAL_SRCS := \ +tensorflow/contrib/lite/examples/minimal/minimal.cc +MINIMAL_OBJS := $(addprefix $(OBJDIR), \ +$(patsubst %.cc,%.o,$(patsubst %.c,%.o,$(MINIMAL_SRCS)))) # What sources we want to compile, must be kept in sync with the main Bazel # build files. @@ -100,7 +99,7 @@ $(wildcard tensorflow/contrib/lite/*/*test.cc) \ $(wildcard tensorflow/contrib/lite/*/*/*test.cc) \ $(wildcard tensorflow/contrib/lite/*/*/*/*test.cc) \ $(wildcard tensorflow/contrib/lite/kernels/test_util.cc) \ -$(BENCHMARK_SRCS) +$(MINIMAL_SRCS) # Filter out all the excluded files. TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS)) # File names of the intermediate files target compilation generates. @@ -119,17 +118,17 @@ $(OBJDIR)%.o: %.c $(CC) $(CCFLAGS) $(INCLUDES) -c $< -o $@ # The target that's compiled if there's no command-line arguments. -all: $(LIB_PATH) $(BENCHMARK_PATH) +all: $(LIB_PATH) $(MINIMAL_PATH) # Gathers together all the objects we've compiled into a single '.a' archive. 
$(LIB_PATH): $(LIB_OBJS) @mkdir -p $(dir $@) $(AR) $(ARFLAGS) $(LIB_PATH) $(LIB_OBJS) -$(BENCHMARK_PATH): $(BENCHMARK_OBJS) $(LIB_PATH) +$(MINIMAL_PATH): $(MINIMAL_OBJS) $(LIB_PATH) @mkdir -p $(dir $@) $(CXX) $(CXXFLAGS) $(INCLUDES) \ - -o $(BENCHMARK_PATH) $(BENCHMARK_OBJS) \ + -o $(MINIMAL_PATH) $(MINIMAL_OBJS) \ $(LIBFLAGS) $(LIB_PATH) $(LDFLAGS) $(LIBS) # Gets rid of all generated files. diff --git a/tensorflow/contrib/lite/RELEASE.md b/tensorflow/contrib/lite/RELEASE.md new file mode 100644 index 00000000000000..8fd63d5cee7db3 --- /dev/null +++ b/tensorflow/contrib/lite/RELEASE.md @@ -0,0 +1,8 @@ +# Release 0.1.7 + +* TensorFlow Lite 0.1.7 is based on tag `tflite-v0.1.7` (git commit + fa1db5eb0da85b5baccc2a46d534fdeb3bb473d0). +* To reproduce the iOS library, it's required to cherry pick git commit + f1f1d5172fe5bfeaeb2cf657ffc43ba744187bee to fix a dependency issue. +* The code is based on TensorFlow 1.8.0 release candidate and it's very close + to TensorFlow 1.8.0 release. diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl index 85216776823eab..aa6a60dc9ed308 100644 --- a/tensorflow/contrib/lite/build_def.bzl +++ b/tensorflow/contrib/lite/build_def.bzl @@ -1,4 +1,8 @@ """Generate Flatbuffer binary from json.""" +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) def tflite_copts(): """Defines compile time flags.""" @@ -185,32 +189,106 @@ def json_to_tflite(name, src, out): tools = [flatc], ) -def gen_zipped_test_files(name, files): +# This is the master list of generated examples that will be made into tests. A +# function called make_XXX_tests() must also appear in generate_examples.py. +# Disable a test by commenting it out. If you do, add a link to a bug or issue. +def generated_test_models(): + return [ + "add", + "arg_max", + "avg_pool", + "batch_to_space_nd", + "concat", + "constant", + "control_dep", + "conv", + "depthwiseconv", + "div", + "exp", + "expand_dims", + "floor", + "fully_connected", + "fused_batch_norm", + "gather", + "global_batch_norm", + "greater", + "greater_equal", + "l2norm", + "l2_pool", + "less", + "less_equal", + "local_response_norm", + "log_softmax", + "lstm", + "max_pool", + "maximum", + "mean", + "minimum", + "mul", + "neg", + "pad", + "padv2", + # "prelu", + "relu", + "relu1", + "relu6", + "reshape", + "resize_bilinear", + "sigmoid", + "sin", + "slice", + "softmax", + "space_to_batch_nd", + "space_to_depth", + "sparse_to_dense", + "split", + "squeeze", + "strided_slice", + "strided_slice_1d_exhaustive", + "sub", + "tile", + "topk", + "transpose", + "transpose_conv", + "where", + ] + +def gen_zip_test(name, test_name, **kwargs): + """Generate a zipped-example test and its dependent zip files. + + Args: + name: Resulting cc_test target name + test_name: Test targets this model. Comes from the list above. + **kwargs: tf_cc_test kwargs. + """ + gen_zipped_test_file( + name = "zip_%s" % test_name, + file = "%s.zip" % test_name, + ) + tf_cc_test(name, **kwargs) + +def gen_zipped_test_file(name, file): """Generate a zip file of tests by using :generate_examples. Args: - name: Name of output. We will produce "`name`_files" as a target. - files: A list of zip file basenames. + name: Name of output. We will produce "`file`.files" as a target. + file: The name of one of the generated_examples targets, e.g. 
"transpose" """ toco = "//tensorflow/contrib/lite/toco:toco" - out_files = [] - for f in files: - out_file = name + "/" + f - out_files.append(out_file) - native.genrule( - name = name + "_" + f + ".files", - cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco - + " --zip_to_output " + f + " $(@D)"), - outs = [out_file], - tools = [ - ":generate_examples", - toco, - ], - ) + native.genrule( + name = file + ".files", + cmd = ("$(locations :generate_examples) --toco $(locations %s) " % toco + + " --zip_to_output " + file + " $(@D)"), + outs = [file], + tools = [ + ":generate_examples", + toco, + ], + ) native.filegroup( name = name, - srcs = out_files, + srcs = [file], ) def gen_selected_ops(name, model): diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h index 4910c89eaebabb..c1cc4476fbd45f 100644 --- a/tensorflow/contrib/lite/builtin_op_data.h +++ b/tensorflow/contrib/lite/builtin_op_data.h @@ -148,10 +148,20 @@ typedef struct { float beta; } TfLiteLocalResponseNormParams; +typedef enum { + kTfLiteLSTMFullKernel = 0, + kTfLiteLSTMBasicKernel +} TfLiteLSTMKernelType; + typedef struct { + // Parameters for LSTM version 1. TfLiteFusedActivation activation; float cell_clip; float proj_clip; + + // Parameters for LSTM version 2. + // kTfLiteLSTMBasicKernel is only supported in version 2 or above. + TfLiteLSTMKernelType kernel_type; } TfLiteLSTMParams; typedef struct { @@ -161,6 +171,9 @@ typedef struct { typedef struct { } TfLitePadParams; +typedef struct { +} TfLitePadV2Params; + typedef struct { // TODO(ahentz): We can't have dynamic data in this struct, at least not yet. // For now we will fix the maximum possible number of dimensions. @@ -227,6 +240,16 @@ typedef struct { TfLiteType output_type; } TfLiteArgMaxParams; +typedef struct { + TfLitePadding padding; + int stride_width; + int stride_height; +} TfLiteTransposeConvParams; + +typedef struct { + bool validate_indices; +} TfLiteSparseToDenseParams; + #ifdef __cplusplus } // extern "C" #endif // __cplusplus diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h index 859bc7ab70dc36..fc6fdd6eefb4ce 100644 --- a/tensorflow/contrib/lite/builtin_ops.h +++ b/tensorflow/contrib/lite/builtin_ops.h @@ -33,6 +33,7 @@ typedef enum { kTfLiteBuiltinDepthwiseConv2d = 4, kTfLiteBuiltinDequantize = 6, kTfLiteBuiltinEmbeddingLookup = 7, + kTfLiteBuiltinFloor = 8, kTfLiteBuiltinFullyConnected = 9, kTfLiteBuiltinHashtableLookup = 10, kTfLiteBuiltinL2Normalization = 11, @@ -83,10 +84,21 @@ typedef enum { kTfLiteBuiltinArgMax = 56, kTfLiteBuiltinMinimum = 57, kTfLiteBuiltinLess = 58, + kTfLiteBuiltinNeg = 59, + kTfLiteBuiltinPadv2 = 60, + kTfLiteBuiltinGreater = 61, + kTfLiteBuiltinGreaterEqual = 62, + kTfLiteBuiltinLessEqual = 63, + kTfLiteBuiltinSelect = 64, + kTfLiteBuiltinSlice = 65, + kTfLiteBuiltinSin = 66, + kTfLiteBuiltinTransposeConv = 67, + kTfLiteBuiltinSparseToDense = 68, + kTfLiteBuiltinTile = 69, + kTfLiteBuiltinExpandDims = 70, } TfLiteBuiltinOperator; #ifdef __cplusplus } // extern "C" #endif // __cplusplus #endif // TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_ -} diff --git a/tensorflow/contrib/lite/context.h b/tensorflow/contrib/lite/context.h index 12841d233cc1d3..4eb66cc225eb04 100644 --- a/tensorflow/contrib/lite/context.h +++ b/tensorflow/contrib/lite/context.h @@ -370,13 +370,21 @@ typedef struct _TfLiteRegistration { // Builtin codes. If this kernel refers to a builtin this is the code // of the builtin. 
This is so we can do marshaling to other frameworks like - // NN API. Note, it is the responsibility of the registration binder to - // set this properly. + // NN API. + // Note: It is the responsibility of the registration binder to set this + // properly. int32_t builtin_code; // Custom op name. If the op is a builtin, this will be null. + // Note: It is the responsibility of the registration binder to set this + // properly. // WARNING: This is an experimental interface that is subject to change. const char* custom_name; + + // The version of the op. + // Note: It is the responsibility of the registration binder to set this + // properly. + int version; } TfLiteRegistration; // WARNING: This is an experimental interface that is subject to change. diff --git a/tensorflow/contrib/lite/context_util.h b/tensorflow/contrib/lite/context_util.h new file mode 100644 index 00000000000000..abe802e34214ca --- /dev/null +++ b/tensorflow/contrib/lite/context_util.h @@ -0,0 +1,48 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// This provides a few C++ helpers that are useful for manipulating C structures +// in C++. +#ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_ +#define TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_ + +#include "tensorflow/contrib/lite/context.h" + +namespace tflite { + +// Provide a range iterable wrapper for TfLiteIntArray* (C lists that TfLite +// C api uses. Can't use the google array_view, since we can't depend on even +// absl for embedded device reasons. +class TfLiteIntArrayView { + public: + // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null + // and this view does not take ownership of it. 
+ explicit TfLiteIntArrayView(const TfLiteIntArray* int_array) + : int_array_(int_array) {} + + TfLiteIntArrayView(const TfLiteIntArrayView&) = default; + TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default; + + typedef const int* const_iterator; + const_iterator begin() const { return int_array_->data; } + const_iterator end() const { return &int_array_->data[int_array_->size]; } + size_t size() const { return end() - begin(); } + + private: + const TfLiteIntArray* int_array_; +}; + +} // namespace tflite + +#endif // TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_ diff --git a/tensorflow/contrib/lite/delegates/nnapi/BUILD b/tensorflow/contrib/lite/delegates/nnapi/BUILD new file mode 100644 index 00000000000000..35a8f6ca4166e3 --- /dev/null +++ b/tensorflow/contrib/lite/delegates/nnapi/BUILD @@ -0,0 +1,31 @@ +package(default_visibility = [ + "//visibility:public", +]) + +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +licenses(["notice"]) # Apache 2.0 + +cc_library( + name = "nnapi_delegate", + srcs = ["nnapi_delegate.cc"], + hdrs = ["nnapi_delegate.h"], + deps = [ + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite:kernel_api", + "//tensorflow/contrib/lite/kernels:kernel_util", + "//tensorflow/contrib/lite/nnapi:nnapi_lib", + ], +) + +tf_cc_test( + name = "nnapi_delegate_test", + size = "small", + srcs = ["nnapi_delegate_test.cc"], + deps = [ + ":nnapi_delegate", + "//tensorflow/contrib/lite:framework", + "//tensorflow/contrib/lite/kernels:test_util", + "@com_google_googletest//:gtest", + ], +) diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc new file mode 100644 index 00000000000000..0731d14419d2de --- /dev/null +++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc @@ -0,0 +1,464 @@ +/* Copyright 2017 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include +#include +#include +#include + +#include "tensorflow/contrib/lite/allocation.h" +#include "tensorflow/contrib/lite/builtin_op_data.h" +#include "tensorflow/contrib/lite/builtin_ops.h" +#include "tensorflow/contrib/lite/context.h" +#include "tensorflow/contrib/lite/context_util.h" +#include "tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h" +#include "tensorflow/contrib/lite/kernels/kernel_util.h" +#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h" + +namespace tflite { +namespace { + +// TODO(b/80621585): Consider printing error string, but don't for now to +// minimize binary size. 
+#define CHECK_NN(context, code) \ + if (code != ANEURALNETWORKS_NO_ERROR) { \ + context->ReportError(context, "NN API returned error (%d).\n", code); \ + return kTfLiteError; \ + } + +// RAII NN API Model Destructor for use with std::unique_ptr +struct NNFreeModel { + void operator()(ANeuralNetworksModel* model) { + ANeuralNetworksModel_free(model); + } +}; +// RAII NN API Compilation Destructor for use with std::unique_ptr +struct NNFreeCompilation { + void operator()(ANeuralNetworksCompilation* model) { + ANeuralNetworksCompilation_free(model); + } +}; + +// Track tensor indices to NN API tensor indices mapping. +class OperandMapping { + public: + // Given a TFLite index return the ANN index. If it doesn't exist + // return -1. + int lite_index_to_ann(int index) const { + if (index < lite_tensor_to_ann_tensor_.size()) + return lite_tensor_to_ann_tensor_[index]; + else + return -1; + } + + // NN API uses non tensor operands instead of structs. This creates one + // and returns the index. It uses a std::vector and resizes it as needed + // keeping -1 to unmapped values. Intermediate tensors likely will not + // be mapped. + int add_new_non_tensor_operand() { return next_ann_tensor_index_++; } + + // Add a new mapping from `tflite_index` and return the NN API tensor index. + int add_new_ann_tensor_index(int tflite_index) { + if (tflite_index >= lite_tensor_to_ann_tensor_.size()) { + lite_tensor_to_ann_tensor_.resize(tflite_index + 1); + } + int new_tensor_index = next_ann_tensor_index_++; + lite_tensor_to_ann_tensor_[tflite_index] = new_tensor_index; + return new_tensor_index; + } + + private: + // Next index of ann tensor + int next_ann_tensor_index_ = 0; + + // Mapping from lite index. Use a std::vector for speed and code size + // rather than a map. + std::vector lite_tensor_to_ann_tensor_; +}; + +// Abstract builder for building an op in the NN API graph. This handles +// the disparity between TFLite and NN API operand types. NN API has singular +// operands for both tensors and parameters, and TFLite separates the two. +class NNAPIOpBuilder { + public: + NNAPIOpBuilder(TfLiteContext* context, OperandMapping* tensor_mapping, + ANeuralNetworksModel* nn_model) + : context_(context), + operand_mapping_(tensor_mapping), + nn_model_(nn_model) {} + + TfLiteStatus AddScalarInt32Operand(int value) { + ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32}; + CHECK_NN(context_, + ANeuralNetworksModel_addOperand(nn_model_, &operand_type)); + int ann_operand = operand_mapping_->add_new_non_tensor_operand(); + CHECK_NN(context_, ANeuralNetworksModel_setOperandValue( + nn_model_, ann_operand, &value, sizeof(int32_t))); + augmented_inputs_.push_back(ann_operand); + return kTfLiteOk; + } + + TfLiteStatus AddTensorInput(int tensor_index) { + int ann_index; + TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index)); + augmented_inputs_.push_back(ann_index); + return kTfLiteOk; + } + + TfLiteStatus AddTensorOutput(int tensor_index) { + int ann_index; + TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index)); + augmented_outputs_.push_back(ann_index); + return kTfLiteOk; + } + + // Adds a new NN API tensor that shadows the TF Lite tensor `tensor_index`. + // This returns the NN API tensor index corresponding to the created tensor. + // If another caller previously created a NN API tensor for `tensor_index` + // then the existing one is returned. 
+  TfLiteStatus AddTensor(int tensor_index, int* ann_tensor_index_out) {
+    int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
+    if (ann_tensor_index != -1) {
+      *ann_tensor_index_out = ann_tensor_index;
+      return kTfLiteOk;
+    }
+    // Allocate a new tensor index
+    ann_tensor_index = operand_mapping_->add_new_ann_tensor_index(tensor_index);
+
+    // Parameters needed for new type.
+    int32_t nn_type = 0;
+    float scale = 0.0f;
+    int32_t zeroPoint = 0;
+    TfLiteTensor* tensor = &context_->tensors[tensor_index];
+    switch (tensor->type) {
+      case kTfLiteNoType:
+        // Tensors added during initialization of Ops don't have a type yet and
+        // should not be registered with the NNAPI.
+        *ann_tensor_index_out = -1;
+        return kTfLiteOk;
+      case kTfLiteFloat32:
+        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+        scale = 0.f;
+        break;
+      case kTfLiteUInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        break;
+      case kTfLiteInt32:
+        nn_type = ANEURALNETWORKS_TENSOR_INT32;
+        scale = 0.f;
+        zeroPoint = 0;
+        break;
+      default:
+        context_->ReportError(context_, "Logic error in NN API Delegate.\n");
+        return kTfLiteError;
+    }
+
+    ANeuralNetworksOperandType operand_type{
+        nn_type, static_cast<uint32_t>(tensor->dims->size),
+        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+
+    if (tensor->allocation_type == kTfLiteMmapRo) {
+      // TODO(b/80630405): Use NNAPIAllocation.
+      CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
+                             nn_model_, ann_tensor_index, tensor->data.raw,
+                             tensor->bytes));
+    }
+
+    *ann_tensor_index_out = ann_tensor_index;
+    return kTfLiteOk;
+  }
+
+  // Finish emitting the op (of type `type`) into the NN API.
+  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
+    // Actually add a NN API operation
+    CHECK_NN(context_, ANeuralNetworksModel_addOperation(
+                           nn_model_, type,
+                           static_cast<uint32_t>(augmented_inputs_.size()),
+                           augmented_inputs_.data(),
+                           static_cast<uint32_t>(augmented_outputs_.size()),
+                           augmented_outputs_.data()));
+    augmented_inputs_.clear();
+    augmented_outputs_.clear();
+    return kTfLiteOk;
+  }
+
+ private:
+  // TfLiteContext for error handling. Must be named context for macros to
+  // work.
+  TfLiteContext* context_;
+
+  // Tracks the mapping between TF Lite and NN API tensor indices.
+  OperandMapping* operand_mapping_;
+
+  // The NN API model being built.
+  ANeuralNetworksModel* nn_model_;
+
+  // Inputs and outputs for the current op. These are augmented in the sense
+  // that NN API uses operands for all arguments, not just tensors, unlike
+  // TensorFlow Lite.
+  std::vector<uint32_t> augmented_inputs_;
+  std::vector<uint32_t> augmented_outputs_;
+};
+
+// The kernel that represents the subgraph of TF Lite being run on NN API.
+class NNAPIDelegateKernel {
+ public:
+  NNAPIDelegateKernel() = default;
+
+  typedef ANeuralNetworksOperationType (*MappingFn)(TfLiteContext*,
+                                                    NNAPIOpBuilder* builder,
+                                                    TfLiteNode* node);
+
+  // Returns a function that knows how to translate a node into its operands
+  // when called. You can use this function to see if a node is supported
+  // (i.e. the returned MappingFn is not nullptr).
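+  // For example (a sketch of how the callers later in this file use it;
+  // `kernel`, `reg` and `builder` stand in for the surrounding variables):
+  //   if (MappingFn fn = kernel.Map(context, reg->builtin_code, node)) {
+  //     // Supported: fn adds the operands and returns the NN API op type.
+  //     ANeuralNetworksOperationType nn_op = fn(context, &builder, node);
+  //   }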
+ MappingFn Map(TfLiteContext* context, int builtin_code, TfLiteNode* node) { + switch (builtin_code) { + case kTfLiteBuiltinAdd: + return [](TfLiteContext* context, NNAPIOpBuilder* builder, + TfLiteNode* node) -> ANeuralNetworksOperationType { + auto builtin = reinterpret_cast(node->builtin_data); + builder->AddScalarInt32Operand(builtin->activation); + return ANEURALNETWORKS_ADD; + }; + break; + case kTfLiteBuiltinAveragePool2d: + return [](TfLiteContext* context, NNAPIOpBuilder* builder, + TfLiteNode* node) -> ANeuralNetworksOperationType { + auto builtin = + reinterpret_cast(node->builtin_data); + builder->AddScalarInt32Operand(builtin->padding); + builder->AddScalarInt32Operand(builtin->stride_width); + builder->AddScalarInt32Operand(builtin->stride_height); + builder->AddScalarInt32Operand(builtin->filter_width); + builder->AddScalarInt32Operand(builtin->filter_height); + builder->AddScalarInt32Operand(builtin->activation); + return ANEURALNETWORKS_AVERAGE_POOL_2D; + }; + break; + default: + return nullptr; + } + } + + // Initialize the kernel (a NN model). + TfLiteStatus Init(TfLiteContext* context, + const TfLiteDelegateParams* params) { + for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) { + nodes_.push_back(node_index); + } + + if (!nn_model_) { + ANeuralNetworksModel* model; + CHECK_NN(context, ANeuralNetworksModel_create(&model)); + nn_model_.reset(model); + + TF_LITE_ENSURE_STATUS( + BuildGraph(context, params->input_tensors, params->output_tensors)); + } + + if (!nn_compilation_) { + ANeuralNetworksCompilation* compilation; + CHECK_NN(context, ANeuralNetworksCompilation_create(nn_model_.get(), + &compilation)); + CHECK_NN(context, ANeuralNetworksCompilation_finish(compilation)); + nn_compilation_.reset(compilation); + } + return kTfLiteOk; + } + + TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) { + ANeuralNetworksExecution* execution = nullptr; + CHECK_NN(context, ANeuralNetworksExecution_create(nn_compilation_.get(), + &execution)); + + // Set the input tensor buffers. Note: we access tflite tensors using + // absolute indices but NN api indices inputs by relative indices. + int relative_input_index = 0; + for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) { + TfLiteTensor* tensor = &context->tensors[absolute_input_index]; + CHECK_NN(context, ANeuralNetworksExecution_setInput( + execution, relative_input_index, nullptr, + tensor->data.raw, tensor->bytes)); + relative_input_index++; + } + + // Set the output tensor buffers. + int relative_output_index = 0; + for (auto output_index : TfLiteIntArrayView(node->outputs)) { + TfLiteTensor* tensor = &context->tensors[output_index]; + CHECK_NN(context, ANeuralNetworksExecution_setOutput( + execution, relative_output_index, nullptr, + tensor->data.raw, tensor->bytes)); + relative_output_index++; + } + // Invoke ANN in blocking fashion. + ANeuralNetworksEvent* event = nullptr; + CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event)); + CHECK_NN(context, ANeuralNetworksEvent_wait(event)); + ANeuralNetworksEvent_free(event); + ANeuralNetworksExecution_free(execution); + + return kTfLiteOk; + } + + private: + // ANN API state. + std::unique_ptr nn_model_; + std::unique_ptr + nn_compilation_; + // Node indices that this delegate is responsible for. Indices here + // indexes into the nodes array in the TfLiteContext. 
+ std::vector nodes_; + // Track indices we use + OperandMapping operand_mapping_; + + TfLiteStatus AddOpsAndTensors(TfLiteContext* context) { + // The operand builder allows creating a single op. We create it at this + // reduced power position rather than in the for loop to avoid reallocating + // the vectors. + NNAPIOpBuilder builder(context, &operand_mapping_, nn_model_.get()); + // Add Tensors + // allocate outside to avoid realloc + for (auto node_index : nodes_) { + // Obtain the op and registration. + TfLiteNode* node; + TfLiteRegistration* reg; + context->GetNodeAndRegistration(context, node_index, &node, ®); + // Map inputs to NN API tensor indices. + for (auto input_index : TfLiteIntArrayView(node->inputs)) { + TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index)); + } + // Get op type and operands + int nn_op_type = + Map(context, reg->builtin_code, node)(context, &builder, node); + // Map outputs to NN API tensor indices. + for (auto output_index : TfLiteIntArrayView(node->outputs)) { + TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index)); + } + + builder.FinalizeAddOperation(nn_op_type); + } + return kTfLiteOk; + } + + TfLiteStatus BuildGraph(TfLiteContext* context, + const TfLiteIntArray* input_tensors, + const TfLiteIntArray* output_tensors) { + // Build the ops and tensors. + TF_LITE_ENSURE_STATUS(AddOpsAndTensors(context)); + // Map input and output tensor indices to ANN + std::vector inputs; + inputs.reserve(input_tensors->size); + std::vector outputs; + outputs.reserve(output_tensors->size); + // Make the TensorFlow lite inputs and outputs to ann_indices. + for (int i : TfLiteIntArrayView(input_tensors)) + inputs.push_back(operand_mapping_.lite_index_to_ann(i)); + for (int i : TfLiteIntArrayView(output_tensors)) + outputs.push_back(operand_mapping_.lite_index_to_ann(i)); + // Tell ANN to declare inputs/outputs + CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs( + nn_model_.get(), inputs.size(), inputs.data(), + outputs.size(), outputs.data())); + // Finalize the model + CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get())); + + return kTfLiteOk; + } +}; + +} // namespace + +// Return a NN API Delegate struct that can check for support of ops. +TfLiteDelegate* NnApiDelegate() { + static TfLiteDelegate delegate = { + .data_ = nullptr, + .Prepare = [](TfLiteContext* context, + TfLiteDelegate* delegate) -> TfLiteStatus { + // Do not check nodes_ if NN API is unavailable. + if (!NNAPIExists()) return kTfLiteOk; + + std::vector supported_nodes(1); + // We don't care about all nodes_, we only care about ones in the + // current plan. + TfLiteIntArray* plan; + TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan)); + int total_supported_nodes = 0; + // Check for every node if it is supported + // TODO(b/80625235): Fix this to do more careful checking of versioning. + for (int node_index : TfLiteIntArrayView(plan)) { + TfLiteNode* node; + TfLiteRegistration* registration; + TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration( + context, node_index, &node, ®istration)); + NNAPIDelegateKernel dummy_kernel; + if (dummy_kernel.Map(context, registration->builtin_code, node)) { + supported_nodes.push_back(node_index); + } + total_supported_nodes += 1; + } + // Put the size at the beginning of the array. 
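+        // supported_nodes is laid out like a TfLiteIntArray
+        // ([size, data[0], data[1], ...]), so element 0 must hold the number
+        // of node indices that follow; see the reinterpret_cast below.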
+        supported_nodes[0] = supported_nodes.size() - 1;
+
+        // NN API Delegate Registration (the pseudo kernel that will invoke
+        // NN API subgraphs)
+        static const TfLiteRegistration nnapi_delegate_kernel = {
+            .init = [](TfLiteContext* context, const char* buffer,
+                       size_t length) -> void* {
+              const TfLiteDelegateParams* params =
+                  reinterpret_cast<const TfLiteDelegateParams*>(buffer);
+              NNAPIDelegateKernel* kernel_state = new NNAPIDelegateKernel;
+              kernel_state->Init(context, params);
+              return kernel_state;
+            },
+
+            .free = [](TfLiteContext* context, void* buffer) -> void {
+              delete reinterpret_cast<NNAPIDelegateKernel*>(buffer);
+            },
+
+            .prepare = [](TfLiteContext* context,
+                          TfLiteNode* node) -> TfLiteStatus {
+              // Since the underlying resize happened ahead of delegation,
+              // there is nothing to do here.
+              return kTfLiteOk;
+            },
+
+            .invoke = [](TfLiteContext* context,
+                         TfLiteNode* node) -> TfLiteStatus {
+              NNAPIDelegateKernel* state =
+                  reinterpret_cast<NNAPIDelegateKernel*>(node->user_data);
+              return state->Invoke(context, node);
+            },
+
+            .builtin_code = kTfLiteBuiltinDelegate,
+        };
+
+        // Request TFLite to partition the graph and make kernels
+        // for each independent subgraph a new nnapi_delegate_kernel.
+        context->ReplaceSubgraphsWithDelegateKernels(
+            context, nnapi_delegate_kernel,
+            reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
+            delegate);
+        return kTfLiteOk;
+      }};
+
+  return &delegate;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h
new file mode 100644
index 00000000000000..44cca2fd285370
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// Return a delegate that can be used to run models on the NN API.
+// e.g.
+//   TfLiteDelegate* delegate = NnApiDelegate();
+//   interpreter->ModifyGraphWithDelegate(delegate);
+// NnApiDelegate() returns a singleton, so you should not free this
+// pointer or worry about its lifetime.
+TfLiteDelegate* NnApiDelegate();
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
new file mode 100644
index 00000000000000..ff2e721423f078
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class FloatAddOpModel : public SingleOpModel {
+ public:
+  FloatAddOpModel(const TensorData& input1, const TensorData& input2,
+                  const TensorData& output,
+                  ActivationFunctionType activation_type) {
+    this->SetApplyDelegate([](Interpreter* interpreter) {
+      interpreter->ModifyGraphWithDelegate(NnApiDelegate());
+    });
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
+                 CreateAddOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+// Do a test with the NN API using no activation.
+TEST(NNAPIDelegate, AddWithNoActivation) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3}));
+}
+
+// Do a test with the NN API using ReLU activation.
+TEST(NNAPIDelegate, AddWithRelu) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.4, 1.0, 1.3}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index 436c3e1d4cad5e..840015a7fad173 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -30,9 +30,7 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
-# the archive has been propagated in mirror.bazel.build.
-GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" +GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)" GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz" ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)" NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip" diff --git a/tensorflow/contrib/lite/examples/android/BUILD b/tensorflow/contrib/lite/examples/android/BUILD index 49280129971e38..57000072561303 100644 --- a/tensorflow/contrib/lite/examples/android/BUILD +++ b/tensorflow/contrib/lite/examples/android/BUILD @@ -42,7 +42,6 @@ android_binary( custom_package = "org.tensorflow.lite.demo", inline_constants = 1, manifest = "AndroidManifest.xml", - manifest_merger = "android", nocompress_extensions = [ ".tflite", ], diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h index 2a64c1de725b60..e36218e4f12057 100644 --- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h +++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers_impl.h @@ -62,8 +62,8 @@ void resize(T* out, uint8_t* in, int image_height, int image_width, {1, wanted_height, wanted_width, wanted_channels}, quant); ops::builtin::BuiltinOpResolver resolver; - TfLiteRegistration* resize_op = - resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR); + const TfLiteRegistration* resize_op = + resolver.FindOp(BuiltinOperator_RESIZE_BILINEAR, 1); auto* params = reinterpret_cast( malloc(sizeof(TfLiteResizeBilinearParams))); params->align_corners = false; diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc index a91467d345fdce..966fcd2a31fd4d 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.cc +++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include #include +#include #include #include #include @@ -70,6 +71,22 @@ TfLiteStatus ReadLabelsFile(const string& file_name, return kTfLiteOk; } +void PrintProfilingInfo(const profiling::ProfileEvent* e, uint32_t op_index, + TfLiteRegistration registration) { + // output something like + // time (ms) , Node xxx, OpCode xxx, symblic name + // 5.352, Node 5, OpCode 4, DEPTHWISE_CONV_2D + + LOG(INFO) << std::fixed << std::setw(10) << std::setprecision(3) + << (e->end_timestamp_us - e->begin_timestamp_us) / 1000.0 + << ", Node " << std::setw(3) << std::setprecision(3) << op_index + << ", OpCode " << std::setw(3) << std::setprecision(3) + << registration.builtin_code << ", " + << EnumNameBuiltinOperator( + static_cast(registration.builtin_code)) + << "\n"; +} + void RunInference(Settings* s) { if (!s->model_name.c_str()) { LOG(ERROR) << "no model file name\n"; @@ -166,19 +183,36 @@ void RunInference(Settings* s) { exit(-1); } + profiling::Profiler* profiler = new profiling::Profiler(); + interpreter->SetProfiler(profiler); + + if (s->profiling) profiler->StartProfiling(); + struct timeval start_time, stop_time; - gettimeofday(&start_time, NULL); + gettimeofday(&start_time, nullptr); for (int i = 0; i < s->loop_count; i++) { if (interpreter->Invoke() != kTfLiteOk) { LOG(FATAL) << "Failed to invoke tflite!\n"; } } - gettimeofday(&stop_time, NULL); + gettimeofday(&stop_time, nullptr); LOG(INFO) << "invoked \n"; LOG(INFO) << "average time: " << (get_us(stop_time) - get_us(start_time)) / (s->loop_count * 1000) << " ms \n"; + if (s->profiling) { + profiler->StopProfiling(); + auto profile_events = profiler->GetProfileEvents(); + for (int i = 0; i < profile_events.size(); i++) { + auto op_index = profile_events[i]->event_metadata; + const auto node_and_registration = + interpreter->node_and_registration(op_index); + const TfLiteRegistration registration = node_and_registration->second; + PrintProfilingInfo(profile_events[i], op_index, registration); + } + } + const int output_size = 1000; const size_t num_results = 5; const float threshold = 0.001f; @@ -217,13 +251,14 @@ void RunInference(Settings* s) { void display_usage() { LOG(INFO) << "label_image\n" - << "--accelerated, -a: [0|1], use Android NNAPI or note\n" + << "--accelerated, -a: [0|1], use Android NNAPI or not\n" << "--count, -c: loop interpreter->Invoke() for certain times\n" << "--input_mean, -b: input mean\n" << "--input_std, -s: input standard deviation\n" << "--image, -i: image_name.bmp\n" << "--labels, -l: labels for the model\n" << "--tflite_model, -m: model_name.tflite\n" + << "--profiling, -p: [0|1], profiling or not\n" << "--threads, -t: number of threads\n" << "--verbose, -v: [0|1] print more information\n" << "\n"; @@ -235,21 +270,22 @@ int Main(int argc, char** argv) { int c; while (1) { static struct option long_options[] = { - {"accelerated", required_argument, 0, 'a'}, - {"count", required_argument, 0, 'c'}, - {"verbose", required_argument, 0, 'v'}, - {"image", required_argument, 0, 'i'}, - {"labels", required_argument, 0, 'l'}, - {"tflite_model", required_argument, 0, 'm'}, - {"threads", required_argument, 0, 't'}, - {"input_mean", required_argument, 0, 'b'}, - {"input_std", required_argument, 0, 's'}, - {0, 0, 0, 0}}; + {"accelerated", required_argument, nullptr, 'a'}, + {"count", required_argument, nullptr, 'c'}, + {"verbose", required_argument, nullptr, 'v'}, + {"image", required_argument, nullptr, 'i'}, + {"labels", required_argument, nullptr, 'l'}, + {"tflite_model", required_argument, nullptr, 'm'}, + 
{"profiling", required_argument, nullptr, 'p'}, + {"threads", required_argument, nullptr, 't'}, + {"input_mean", required_argument, nullptr, 'b'}, + {"input_std", required_argument, nullptr, 's'}, + {nullptr, 0, nullptr, 0}}; /* getopt_long stores the option index here. */ int option_index = 0; - c = getopt_long(argc, argv, "a:b:c:f:i:l:m:s:t:v:", long_options, + c = getopt_long(argc, argv, "a:b:c:f:i:l:m:p:s:t:v:", long_options, &option_index); /* Detect the end of the options. */ @@ -257,15 +293,14 @@ int Main(int argc, char** argv) { switch (c) { case 'a': - s.accel = strtol( // NOLINT(runtime/deprecated_fn) - optarg, (char**)NULL, 10); + s.accel = strtol(optarg, nullptr, 10); // NOLINT(runtime/deprecated_fn) break; case 'b': - s.input_mean = strtod(optarg, NULL); + s.input_mean = strtod(optarg, nullptr); break; case 'c': - s.loop_count = strtol( // NOLINT(runtime/deprecated_fn) - optarg, (char**)NULL, 10); + s.loop_count = + strtol(optarg, nullptr, 10); // NOLINT(runtime/deprecated_fn) break; case 'i': s.input_bmp_name = optarg; @@ -276,16 +311,20 @@ int Main(int argc, char** argv) { case 'm': s.model_name = optarg; break; + case 'p': + s.profiling = + strtol(optarg, nullptr, 10); // NOLINT(runtime/deprecated_fn) + break; case 's': - s.input_std = strtod(optarg, NULL); + s.input_std = strtod(optarg, nullptr); break; case 't': s.number_of_threads = strtol( // NOLINT(runtime/deprecated_fn) - optarg, (char**)NULL, 10); + optarg, nullptr, 10); break; case 'v': - s.verbose = strtol( // NOLINT(runtime/deprecated_fn) - optarg, (char**)NULL, 10); + s.verbose = + strtol(optarg, nullptr, 10); // NOLINT(runtime/deprecated_fn) break; case 'h': case '?': diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.h b/tensorflow/contrib/lite/examples/label_image/label_image.h index 4de32e33fb4ef2..4b48014e1c77ec 100644 --- a/tensorflow/contrib/lite/examples/label_image/label_image.h +++ b/tensorflow/contrib/lite/examples/label_image/label_image.h @@ -25,6 +25,7 @@ struct Settings { bool verbose = false; bool accel = false; bool input_floating = false; + bool profiling = false; int loop_count = 1; float input_mean = 127.5f; float input_std = 127.5f; diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc new file mode 100644 index 00000000000000..8b0ace96ccaf06 --- /dev/null +++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc @@ -0,0 +1,71 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/contrib/lite/model.h" +#include "tensorflow/contrib/lite/interpreter.h" +#include "tensorflow/contrib/lite/kernels/register.h" +#include + +// This is an example that is minimal to read a model +// from disk and perform inference. There is no data being loaded +// that is up to you to add as a user. 
+// +// NOTE: Do not add any dependencies to this that cannot be built with +// the minimal makefile. This example must remain trivial to build with +// the minimal build tool. +// +// Usage: minimal + +using namespace tflite; + +#define TFLITE_MINIMAL_CHECK(x) \ + if(!(x)) { \ + fprintf(stderr, "Error at %s:%d\n", __FILE__, __LINE__); \ + exit(1); \ + } + + +int main(int argc, char *argv[]) { + if(argc != 2) { + fprintf(stderr, "minimal \n"); + return 1; + } + const char* filename = argv[1]; + + // Load model + std::unique_ptr model + = tflite::FlatBufferModel::BuildFromFile(filename); + TFLITE_MINIMAL_CHECK(model != nullptr); + + // Build the interpreter + tflite::ops::builtin::BuiltinOpResolver resolver; + InterpreterBuilder builder(*model.get(), resolver); + std::unique_ptr interpreter; + builder(&interpreter); + TFLITE_MINIMAL_CHECK(interpreter != nullptr); + + // Allocate tensor buffers. + TFLITE_MINIMAL_CHECK(interpreter->AllocateTensors() == kTfLiteOk); + + // Fill input buffers + // TODO(user): Insert code to fill input tensors + + // Run inference + TFLITE_MINIMAL_CHECK(interpreter->Invoke() == kTfLiteOk); + + // Read output buffers + // TODO(user): Insert getting data out code. + + return 0; +} diff --git a/tensorflow/contrib/lite/g3doc/custom_operators.md b/tensorflow/contrib/lite/g3doc/custom_operators.md index d7cc854ebac08e..972e57f73e8296 100644 --- a/tensorflow/contrib/lite/g3doc/custom_operators.md +++ b/tensorflow/contrib/lite/g3doc/custom_operators.md @@ -39,7 +39,7 @@ TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) { TF_LITE_ENSURE_EQ(context, NumInputs(node), 1); TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1); - TfLiteTensor* input = GetInput(context, node, 0); + const TfLiteTensor* input = GetInput(context, node, 0); TfLiteTensor* output = GetOutput(context, node, 0); int num_dims = NumDimensions(input); @@ -54,7 +54,7 @@ TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) { TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) { using namespace tflite; - TfLiteTensor* input = GetInput(context, node,0); + const TfLiteTensor* input = GetInput(context, node,0); TfLiteTensor* output = GetOutput(context, node,0); float* input_data = input->data.f; diff --git a/tensorflow/contrib/lite/g3doc/models.md b/tensorflow/contrib/lite/g3doc/models.md index d8134d5a00097b..c1c8ef049f693d 100644 --- a/tensorflow/contrib/lite/g3doc/models.md +++ b/tensorflow/contrib/lite/g3doc/models.md @@ -1,28 +1,63 @@ # List of Hosted Models -* [NASNet large](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_large_2018_03_27.zip) -* [NASNet mobile](https://storage.googleapis.com/download.tensorflow.org/models/tflite/nasnet_mobile_2018_03_27.zip) -* [ResNet v2 101](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_101_2018_03_27.zip) -* [ResNet v2 50](https://storage.googleapis.com/download.tensorflow.org/models/tflite/resnet_v2_50_2018_03_27.zip) -* [Inception ResNet v2](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_resnet_v2_2018_03_27.zip) -* [Inception v4](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v4_2018_03_27.zip) -* [Inception v3 2015](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_2015_2017_11_10.zip) -* [Inception v3 Slim 2016](https://storage.googleapis.com/download.tensorflow.org/models/tflite/inception_v3_slim_2016_android_2017_11_10.zip) -* [Mobilenet 0.25 128 
Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_128_float_2017_11_08.zip) -* [Mobilenet 0.25 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_160_float_2017_11_08.zip) -* [Mobilenet 0.25 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_192_float_2017_11_08.zip) -* [Mobilenet 0.25 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.25_224_float_2017_11_08.zip) -* [Mobilenet 0.50 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_128_float_2017_11_08.zip) -* [Mobilenet 0.50 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_160_float_2017_11_08.zip) -* [Mobilenet 0.50 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_192_float_2017_11_08.zip) -* [Mobilenet 0.50 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.50_224_float_2017_11_08.zip) -* [Mobilenet 0.75 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_128_float_2017_11_08.zip) -* [Mobilenet 0.75 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_160_float_2017_11_08.zip) -* [Mobilenet 0.75 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_192_float_2017_11_08.zip) -* [Mobilenet 0.75 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_0.75_224_float_2017_11_08.zip) -* [Mobilenet 1.0 128 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_128_float_2017_11_08.zip) -* [Mobilenet 1.0 160 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_160_float_2017_11_08.zip) -* [Mobilenet 1.0 192 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_192_float_2017_11_08.zip) -* [Mobilenet 1.0 224 Float](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_1.0_224_float_2017_11_08.zip) -* [Mobilenet 1.0 224 Quant](https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip) -* [Smart Reply 1.0 Android ](https://storage.googleapis.com/download.tensorflow.org/models/tflite/smartreply_1.0_2017_11_01.zip) +## Image classification (Float Models) + +Model Name | Paper_Model_Files^ | Model_Size | Top-1 Accuracy | Top-5 Accuracy | TF Lite Performance^^ | Tensorflow Performance +------------------- | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | --------------------: | ---------------------: +DenseNet | [paper](https://arxiv.org/abs/1608.06993), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/densenet_2018_04_27.tgz) | 43.6 Mb | 64.2% | 85.6% | 894 ms | 1262 ms +SqueezeNet | [paper](https://arxiv.org/abs/1602.07360), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/squeezenet_2018_04_27.tgz) | 5.0 Mb | 49.0% | 72.9% | 224 ms | 255 ms +NASNet mobile | [paper](https://arxiv.org/abs/1707.07012), 
[tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_mobile_2018_04_27.tgz) | 21.4 Mb | 72.2% | 90.6% | 261 ms | 389 ms +NASNet large | [paper](https://arxiv.org/abs/1707.07012), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/nasnet_large_2018_04_27.tgz) | 355.3 Mb | 82.1% | 95.8% | 6697 ms | 7940 ms +ResNet_V2_50 | [paper](https://arxiv.org/abs/1603.05027), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/resnet_v2_50_2018_04_27.tgz) | 102.3 Mb | 68.1% | 88.4% | 942 ms | 1008 ms +ResNet_V2_101 | [paper](https://arxiv.org/abs/1603.05027), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/resnet_v2_101_2018_04_27.tgz) | 178.3 Mb | 70.4% | 89.6% | 1880 ms | 1970 ms +Inception_V3 | [paper](http://arxiv.org/abs/1512.00567), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v3_2018_04_27.tgz) | 95.3 Mb | 76.9% | 93.5% | 1433 ms | 1522 ms +Inception_V4 | [paper](http://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_v4_2018_04_27.tgz) | 170.7 Mb | 79.6% | 94.6% | 2986 ms | 3139 ms +Inception_ResNet_V2 | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](https://storage.googleapis.com/download.tensorflow.org/models/tflite/model_zoo/upload_20180427/inception_resnet_v2_2018_04_27.tgz) | 121.0 Mb | 76.8% | 93.5% | 2731 ms | 2926 ms +Mobilenet_0.25_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128.tgz) | 1.9 Mb | 41.5% | 66.3% | 6.2 ms | 13.0 ms +Mobilenet_0.25_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_160.tgz) | 1.9 Mb | 45.5% | 70.3% | 8.6 ms | 19.5 ms +Mobilenet_0.25_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_192.tgz) | 1.9 Mb | 47.7% | 72.3% | 12.1 ms | 27.8 ms +Mobilenet_0.25_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_224.tgz) | 1.9 Mb | 49.8% | 74.2% | 16.2 ms | 37.3 ms +Mobilenet_0.50_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_128.tgz) | 5.3 Mb | 56.3% | 79.4% | 18.1 ms | 29.9 ms +Mobilenet_0.50_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_160.tgz) | 5.3 Mb | 59.1% | 81.9% | 26.8 ms | 45.9 ms +Mobilenet_0.50_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_192.tgz) | 5.3 Mb | 61.7% | 83.6% | 35.6 ms | 65.3 ms +Mobilenet_0.50_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_224.tgz) | 5.3 Mb | 63.3% | 84.9% | 47.6 ms | 164.2 ms +Mobilenet_0.75_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_128.tgz) | 10.3 Mb | 62.1% | 
83.9% | 34.6 ms | 48.7 ms +Mobilenet_0.75_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_160.tgz) | 10.3 Mb | 65.3% | 86.0% | 51.3 ms | 75.2 ms +Mobilenet_0.75_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_192.tgz) | 10.3 Mb | 67.2% | 87.3% | 71.7 ms | 107.0 ms +Mobilenet_0.75_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_224.tgz) | 10.3 Mb | 68.4% | 88.2% | 95.7 ms | 143.4 ms +Mobilenet_1.0_128 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_128.tgz) | 16.9 Mb | 65.2% | 85.8% | 57.4 ms | 76.8 ms +Mobilenet_1.0_160 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_160.tgz) | 16.9 Mb | 68.0% | 87.7% | 86.0 ms | 117.7 ms +Mobilenet_1.0_192 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_192.tgz) | 16.9 Mb | 70.0% | 89.2% | 118.6 ms | 167.3 ms +Mobilenet_1.0_224 | [paper](https://arxiv.org/pdf/1704.04861.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz) | 16.9 Mb | 70.9% | 89.9% | 160.1 ms | 224.3 ms + +^ The model files include both TF Lite FlatBuffer and Tensorflow frozen Graph. + +^^ The performance numbers are generated in the benchmark on Pixel-2 using +single thread large core. + +## Image classification (Quantized Models) + +Model Name | Paper_Model_Files | Model_Size | Top-1 Accuracy | Top-5 Accuracy | TF Lite Performance +------------------------ | :-------------------------------------------------------------------------------------------------------------------------------------------------------: | ---------: | -------------: | -------------: | ------------------: +Mobilenet_0.25_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_128_quant.tgz) | 0.5 Mb | 39.9% | 65.8% | 3.7 ms +Mobilenet_0.25_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_160_quant.tgz) | 0.5 Mb | 43.5% | 69.1% | 5.5 ms +Mobilenet_0.25_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_192_quant.tgz) | 0.5 Mb | 45.8% | 71.9% | 7.9 ms +Mobilenet_0.25_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.25_224_quant.tgz) | 0.5 Mb | 48.2% | 73.8% | 10.4 ms +Mobilenet_0.50_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_128_quant.tgz) | 1.4 Mb | 54.9% | 78.9% | 8.8 ms +Mobilenet_0.50_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_160_quant.tgz) | 1.4 Mb | 57.7% | 81.3% | 13.0 ms +Mobilenet_0.50_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), 
[tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_192_quant.tgz) | 1.4 Mb | 60.4% | 83.2% | 18.3 ms +Mobilenet_0.50_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.5_224_quant.tgz) | 1.4 Mb | 62.2% | 84.5% | 24.7 ms +Mobilenet_0.75_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_128_quant.tgz) | 2.6 Mb | 59.8% | 82.8% | 16.2 ms +Mobilenet_0.75_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_160_quant.tgz) | 2.6 Mb | 63.9% | 85.5% | 24.3 ms +Mobilenet_0.75_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_192_quant.tgz) | 2.6 Mb | 66.2% | 87.1% | 33.8 ms +Mobilenet_0.75_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_0.75_224_quant.tgz) | 2.6 Mb | 67.9% | 88.1% | 45.4 ms +Mobilenet_1.0_128_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_128_quant.tgz) | 4.3 Mb | 64.0% | 85.5% | 24.9 ms +Mobilenet_1.0_160_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_160_quant.tgz) | 4.3 Mb | 67.3% | 87.7% | 37.4 ms +Mobilenet_1.0_192_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_192_quant.tgz) | 4.3 Mb | 69.0% | 88.9% | 51.9 ms +Mobilenet_1.0_224_quant | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224_quant.tgz) | 4.3 Mb | 69.7% | 89.5% | 70.2 ms + +## Other models + +Model | TF Lite FlatBuffer +----------------------- | :----------------: +Smart Reply 1.0 Android | [reference](https://research.googleblog.com/2017/11/on-device-conversational-modeling-with.html), [tflite](https://storage.googleapis.com/download.tensorflow.org/models/smartreply_1.0_2017_11_01.zip) diff --git a/tensorflow/contrib/lite/g3doc/rpi.md b/tensorflow/contrib/lite/g3doc/rpi.md index 7a3a231626d0e1..ab507893074142 100644 --- a/tensorflow/contrib/lite/g3doc/rpi.md +++ b/tensorflow/contrib/lite/g3doc/rpi.md @@ -32,7 +32,7 @@ This has been tested on Raspberry Pi 3b, Raspbian GNU/Linux 9.1 (stretch), gcc v Log in to you RPI, install the toolchain. ```bash -sudo apt-get instal build-essential +sudo apt-get install build-essential ``` First, clone this TensorFlow repository. Run this at the root of the repository: diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md index 203924f03d3101..b2f6444e9e1dc3 100644 --- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md +++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md @@ -132,10 +132,7 @@ TensorFlow operation not listed above are likely unsupported. 
Notably, the following common ops are not supported at the moment: * [tf.depth_to_space](https://www.tensorflow.org/api_docs/python/tf/depth_to_space) -* [tf.floor](https://www.tensorflow.org/api_docs/python/tf/floor) -* [tf.gather](https://www.tensorflow.org/api_docs/python/tf/gather) * [tf.image.resize_bilinear](https://www.tensorflow.org/api_docs/python/tf/image/resize_bilinear) -* [tf.slice](https://www.tensorflow.org/api_docs/python/tf/slice) * [tf.tanh](https://www.tensorflow.org/api_docs/python/tf/tanh) ## TensorFlow Lite Operations @@ -223,6 +220,23 @@ Options { } ``` +**CONV_2D_TRANSPOSE** + +``` +Inputs { + 0: output_shape + 1: filter + 2: 4D tensor +} +Outputs { + 0: the transpose (gradient) of conv2d +} +Options { + padding: SAME|VALID + stride_w,stride_h: stride of the filter window +} +``` + **DEPTHWISE_CONV_2D** ``` @@ -254,6 +268,17 @@ Outputs { } ``` +**FLOOR** + +``` +inputs { + 0: tensor +} +outputs: { + 0: result of computing element-wise floor of the input tensor +} +``` + **FULLY_CONNECTED** ``` @@ -271,6 +296,45 @@ Options { } ``` +**GATHER** + +``` +Inputs { + 0: params tensor + 1: indices tensor + 2: axis tensor (optional) +} +Outputs { + 0: a tensor with same type as the params tensor. +} +``` + +**GREATER** + +``` +Inputs { + 0: a tensor + 1: a tensor +} +Outputs { + 0: a tensor of type bool, true whenever an element of the first tensor is + greater than the corresponding element of the second tensor. +} +``` + +**GREATER_EQUAL** + +``` +Inputs { + 0: a tensor + 1: a tensor +} +Outputs { + 0: a tensor of type bool, true whenever an element of the first tensor is + greater than or equal to the corresponding element of the second tensor. +} +``` + **L2_NORMALIZATION** ``` @@ -315,6 +379,19 @@ Outputs { } ``` +**LESS_EQUAL** + +``` +Inputs { + 0: a tensor + 1: a tensor +} +Outputs { + 0: a tensor of type bool, true whenever an element of the first tensor is less + than or equal to the corresponding element of the second tensor. +} +``` + **LOCAL_RESPONSE_NORMALIZATION** ``` @@ -387,6 +464,17 @@ Options { } ``` +**NEG** + +``` +Inputs { + 0: a tensor +} +Outputs { + 0: elementwise negation of the input tensor +} +``` + **PAD** ``` @@ -463,6 +551,19 @@ Options { } ``` +**SLICE** + +``` +Inputs { + 0: tensor + 1: 1D tensor + 2: 1D tensor +} +Outputs { + 0: slice of the input tensor of the given size from the given begin index. +} +``` + **SOFTMAX** ``` @@ -506,6 +607,21 @@ Outputs { } ``` +**SPARSE_TO_DENSE** + +``` +Inputs { + 0: 0D or 1D or 2D tensor + 1: 1D tensor + 2: 0D or 1D tensor + 3: 0D tensor + 4: a boolean value +} +Outputs { + 0: Dense Tensor of shape output_shape. Has the same type as sparse_values. +} +``` + **SPLIT** ``` @@ -548,7 +664,7 @@ Outputs { 0: slice of the input tensor of the given size } Options { - begin_mask: mask for begin indicies + begin_mask: mask for begin indices end_mask: mask for end indices shrink_axis_mask: mask that indicates which dimensions to remove } @@ -563,7 +679,7 @@ Inputs { } Outputs { 0: k largest element along each last dimensional slice - 1: indicies of values within the last dimension of the input ensor + 1: indices of values within the last dimension of the input ensor } ``` @@ -579,6 +695,20 @@ Outputs { } ``` +**SELECT** + +``` +Inputs { + 0: tensor + 1: tensor + 2: tensor +} +Outputs { + 0: tensor that contains the elementwise values of 'tensor 1' if the + corresponding value of 'tensor 0' is true or the value of 'tensor 2' if false. 
+} +``` + And these are TensorFlow Lite operations that are present but not ready for custom models yet: diff --git a/tensorflow/contrib/lite/interpreter.cc b/tensorflow/contrib/lite/interpreter.cc index 9d8ea55fd1edc0..ebb0aedc2001a8 100644 --- a/tensorflow/contrib/lite/interpreter.cc +++ b/tensorflow/contrib/lite/interpreter.cc @@ -125,7 +125,8 @@ Interpreter::~Interpreter() { for (int i = 0; i < context_.tensors_size; i++) { TfLiteTensor* tensor = &context_.tensors[i]; - if (tensor->buffer_handle != kTfLiteNullBufferHandle) { + if (tensor->buffer_handle != kTfLiteNullBufferHandle && + tensor->delegate->FreeBufferHandle != nullptr) { tensor->delegate->FreeBufferHandle(tensor->delegate, &tensor->buffer_handle); } diff --git a/tensorflow/contrib/lite/interpreter.h b/tensorflow/contrib/lite/interpreter.h index 6f3433abcf71b6..7315d8360680ca 100644 --- a/tensorflow/contrib/lite/interpreter.h +++ b/tensorflow/contrib/lite/interpreter.h @@ -201,7 +201,7 @@ class Interpreter { // Overrides execution plan. This bounds checks indices sent in. TfLiteStatus SetExecutionPlan(const std::vector& new_plan); - // Get a tensor data structure. + // Get a mutable tensor data structure. // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this // read/write access to structure TfLiteTensor* tensor(int tensor_index) { @@ -210,9 +210,14 @@ class Interpreter { return &context_.tensors[tensor_index]; } + // Get an immutable tensor data structure. + const TfLiteTensor* tensor(int tensor_index) const { + if (tensor_index >= context_.tensors_size || tensor_index < 0) + return nullptr; + return &context_.tensors[tensor_index]; + } + // Get a pointer to an operation and registration data structure if in bounds. - // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this - // read/write access to structure const std::pair* node_and_registration( int node_index) const { if (node_index >= nodes_and_registration_.size() || node_index < 0) @@ -220,7 +225,8 @@ class Interpreter { return &nodes_and_registration_[node_index]; } - // Perform a checked cast to the appropriate tensor type. + // Perform a checked cast to the appropriate tensor type (mutable pointer + // version). template T* typed_tensor(int tensor_index) { if (TfLiteTensor* tensor_ptr = tensor(tensor_index)) { @@ -231,20 +237,46 @@ class Interpreter { return nullptr; } - // Return a pointer into the data of a given input tensor. The given index - // must be between 0 and inputs().size(). + // Perform a checked cast to the appropriate tensor type (immutable pointer + // version). + template + const T* typed_tensor(int tensor_index) const { + if (const TfLiteTensor* tensor_ptr = tensor(tensor_index)) { + if (tensor_ptr->type == typeToTfLiteType()) { + return reinterpret_cast(tensor_ptr->data.raw); + } + } + return nullptr; + } + + // Return a mutable pointer into the data of a given input tensor. The given + // index must be between 0 and inputs().size(). template T* typed_input_tensor(int index) { return typed_tensor(inputs_[index]); } - // Return a pointer into the data of a given output tensor. The given index - // must be between 0 and outputs().size(). + // Return an immutable pointer into the data of a given input tensor. The + // given index must be between 0 and inputs().size(). + template + const T* typed_input_tensor(int index) const { + return typed_tensor(inputs_[index]); + } + + // Return a mutable pointer into the data of a given output tensor. The given + // index must be between 0 and outputs().size(). 
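+  // For example (a usage sketch, assuming the model's first output is a
+  // float tensor):
+  //   float* out = interpreter->typed_output_tensor<float>(0);
+  //   float value = out[0];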
template T* typed_output_tensor(int index) { return typed_tensor(outputs_[index]); } + // Return an immutable pointer into the data of a given output tensor. The + // given index must be between 0 and outputs().size(). + template + const T* typed_output_tensor(int index) const { + return typed_tensor(outputs_[index]); + } + // Change the dimensionality of a given tensor. Note, this is only acceptable // for tensor indices that are inputs. // Returns status of failure or success. @@ -325,9 +357,7 @@ class Interpreter { void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; } - profiling::Profiler* GetProfiler(profiling::Profiler* profiler) { - return profiler_; - } + profiling::Profiler* GetProfiler() { return profiler_; } // The default capacity of `tensors_` vector. static constexpr int kTensorsReservedCapacity = 128; diff --git a/tensorflow/contrib/lite/java/BUILD b/tensorflow/contrib/lite/java/BUILD index 1dda55b8edf8f8..593af81a18a1e2 100644 --- a/tensorflow/contrib/lite/java/BUILD +++ b/tensorflow/contrib/lite/java/BUILD @@ -1,7 +1,9 @@ # Description: # TensorFlow Lite Java API. -package(default_visibility = ["//visibility:private"]) +package(default_visibility = [ + "//tensorflow/contrib/lite/java/ovic:__pkg__", +]) licenses(["notice"]) # Apache 2.0 @@ -46,23 +48,6 @@ android_library( ], ) -java_library( - name = "ovicbenchmarkerlib", - srcs = [ - "ovic/src/main/java/org/tensorflow/ovic/OvicClassifier.java", - "ovic/src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java", - ], - javacopts = JAVACOPTS, - visibility = ["//visibility:public"], - deps = [ - ":libtensorflowlite_jni.so", - ":tensorflowlite_java", - "//tensorflow/contrib/lite/java/src/main/native", - "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper", - "@org_checkerframework_qual", - ], -) - java_library( name = "tensorflowlitelib", srcs = glob( @@ -165,28 +150,6 @@ java_test( ], ) -java_test( - name = "OvicClassifierTest", - size = "medium", - srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"], - data = [ - "ovic/src/testdata/float_model.lite", - "ovic/src/testdata/labels.txt", - "ovic/src/testdata/low_res_model.lite", - "ovic/src/testdata/quantized_model.lite", - "ovic/src/testdata/test_image_128.jpg", - "ovic/src/testdata/test_image_224.jpg", - ], - javacopts = JAVACOPTS, - test_class = "org.tensorflow.ovic.OvicClassifierTest", - visibility = ["//visibility:public"], - deps = [ - ":ovicbenchmarkerlib", - "@com_google_truth", - "@junit", - ], -) - filegroup( name = "libtensorflowlite_jni", srcs = select({ diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml b/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml index ba63dce5d9a719..95b6b7016f2818 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/AndroidManifest.xml @@ -31,6 +31,7 @@ android:theme="@style/MaterialTheme"> diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml index 20f520814d7154..ef8a9e08450d72 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml @@ -13,51 +13,55 @@ See the License for the specific language governing permissions and limitations under the License. 
--> - + android:layout_height="match_parent" + android:background="#bb7700" + android:orientation="horizontal"> + + + + + + - + + + - - - - - - - - - - - - - - + android:paddingTop="20dp" + android:textColor="#FFF" + android:textSize="20sp"/> + + + + diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml index 72a229ecdb19f5..ddb099a950c2f8 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml @@ -28,7 +28,7 @@ + - + android:id="@+id/bottom_info_view" + android:layout_marginBottom="10dp" + android:layout_height="50dp"> + + + android:layout_marginLeft="10dp" + android:background="#0000000f" + android:textColor="@android:color/white" /> + + - - diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml index d12435d5abda45..e567009a424ed7 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml @@ -15,101 +15,80 @@ --> + android:layout_height="match_parent" + android:background="#bb7700"> - + android:layout_weight="1" /> - - - - - + android:layout_alignParentTop="false" + android:background="#bb7700" + android:orientation="vertical" + android:weightSum="100"> + + + + - - + - + android:layout_height="match_parent" + android:textColor="@android:color/white" + android:textAlignment="center" + android:gravity="center" + android:text="@string/threads" /> - - - - - - - + android:layout_marginLeft="10dp" + android:background="#0000000f" + android:textColor="@android:color/white" /> + diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml index 0a71dbd0e8010f..7af8f3a98c6319 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/base-strings.xml @@ -16,7 +16,7 @@ --> - TfLiteCameraDemo + TfLite Camera Demo + Threads: diff --git a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml index 3f3bdfb49480e7..1752b3b5f97e28 100644 --- a/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml +++ b/tensorflow/contrib/lite/java/demo/app/src/main/res/values/styles.xml @@ -14,5 +14,10 @@ limitations under the License. --> - + diff --git a/tensorflow/contrib/lite/java/ovic/BUILD b/tensorflow/contrib/lite/java/ovic/BUILD new file mode 100644 index 00000000000000..362d93636f7220 --- /dev/null +++ b/tensorflow/contrib/lite/java/ovic/BUILD @@ -0,0 +1,68 @@ +# Description: +# OVIC Benchmarker Java API. 
+ +package(default_visibility = ["//visibility:public"]) + +licenses(["notice"]) # Apache 2.0 + +load("//tensorflow/java:build_defs.bzl", "JAVACOPTS") + +java_test( + name = "OvicClassifierTest", + size = "medium", + srcs = ["src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"], + data = [ + "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt", + "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata", + ], + javacopts = JAVACOPTS, + test_class = "org.tensorflow.ovic.OvicClassifierTest", + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib_java", + "@com_google_truth", + "@junit", + ], +) + +java_binary( + name = "ovic_validator", + srcs = ["src/main/java/org/tensorflow/ovic/OvicValidator.java"], + data = [ + "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt", + ], + main_class = "org.tensorflow.ovic.OvicValidator", + deps = [ + "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib_java", + ], +) + +android_library( + name = "ovicbenchmarkerlib", + srcs = [ + "src/main/java/org/tensorflow/ovic/OvicClassifier.java", + "src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java", + ], + manifest = "//tensorflow/contrib/lite/java:AndroidManifest.xml", + deps = [ + "//tensorflow/contrib/lite/java:tensorflowlite", + "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper", + "@org_checkerframework_qual", + ], +) + +java_library( + name = "ovicbenchmarkerlib_java", + srcs = [ + "src/main/java/org/tensorflow/ovic/OvicClassifier.java", + "src/main/java/org/tensorflow/ovic/OvicSingleImageResult.java", + ], + javacopts = JAVACOPTS, + deps = [ + "//tensorflow/contrib/lite/java:libtensorflowlite_jni.so", + "//tensorflow/contrib/lite/java:tensorflowlite_java", + "//tensorflow/contrib/lite/java/src/main/native", + "//tensorflow/contrib/lite/java/src/testhelper/java/org/tensorflow/lite:testhelper", + "@org_checkerframework_qual", + ], +) diff --git a/tensorflow/contrib/lite/java/ovic/README.md b/tensorflow/contrib/lite/java/ovic/README.md index 76c33838bfe5b8..26349347faebac 100644 --- a/tensorflow/contrib/lite/java/ovic/README.md +++ b/tensorflow/contrib/lite/java/ovic/README.md @@ -2,11 +2,11 @@ This folder contains building code for track one of the [Low Power ImageNet Recognition Challenge workshop at CVPR 2018.](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018) -## Pre-requesits +## Pre-requisite Follow the steps [here](https://www.tensorflow.org/mobile/tflite/demo_android) to install Tensorflow, Bazel, and the Android NDK and SDK. -## To test the benchmarker: +## Test the benchmarker: The testing utilities helps the developers (you) to make sure that your submissions in TfLite format will be processed as expected in the competition's benchmarking system. @@ -37,47 +37,122 @@ unzip -j /tmp/ovic.zip -d tensorflow/contrib/lite/java/ovic/src/testdata/ You can run test with Bazel as below. This helps to ensure that the installation is correct. ```sh -bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java:OvicClassifierTest --test_output=all +bazel test --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:OvicClassifierTest --cxxopt=-Wno-all --test_output=all ``` ### Test your submissions -Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it as below. 
+Once you have a submission that follows the instructions from the [competition site](https://rebootingcomputing.ieee.org/home/sitemap/14-lpirc/80-low-power-image-recognition-challenge-lpirc-2018), you can verify it in two ways:
-* Move your submission to the testdata folder:
+#### Validate using randomly generated images
-Let say the submission file is located at `/tmp/my_model.lite`, then
+You can call the validator binary below to verify that your model fits the format requirements. This often helps you catch size mismatches (e.g. output should be [1, 1001] instead of [1,1,1,1001]). Let's say the submission file is located at `/path/to/my_model.lite`. Then call:
```sh -cp /tmp/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/ +bazel build --cxxopt=--std=c++11 //tensorflow/contrib/lite/java/ovic:ovic_validator --cxxopt=-Wno-all +bazel-bin/tensorflow/contrib/lite/java/ovic/ovic_validator /path/to/my_model.lite +``` +
+Successful validation should print the following message to the terminal: + +``` +Successfully validated /path/to/my_model.lite. + +``` +
+#### Test that the model produces sensible outcomes +
+You can go a step further to verify that the model produces results as expected. This helps you catch bugs during TOCO conversion (e.g. using the wrong mean and std values). +
+* Move your submission to the testdata folder: + +```sh +cp /path/to/my_model.lite tensorflow/contrib/lite/java/ovic/src/testdata/ ```
* Resize the test image to the resolutions that are expected by your submission: The test images can be found at `tensorflow/contrib/lite/java/ovic/src/testdata/test_image_*.jpg`. You may reuse these images if your image resolutions are 128x128 or 224x224.
-* Add your model and test image to the BUILD rule: +* Add your model and test image to the BUILD rule at `tensorflow/contrib/lite/java/ovic/src/testdata/BUILD`:
```JSON -java_test( - name = "OvicClassifierTest", - size = "medium", - srcs = ["ovic/src/test/java/org/tensorflow/ovic/OvicClassifierTest.java"], - data = [ - "ovic/src/testdata/float_model.lite", - "ovic/src/testdata/labels.txt", - "ovic/src/testdata/low_res_model.lite", - "ovic/src/testdata/quantized_model.lite", - "ovic/src/testdata/test_image_128.jpg", - "ovic/src/testdata/test_image_224.jpg", - "ovic/src/testdata/my_model.lite", # <--- Your submission. - "ovic/src/testdata/my_test_image.jpg", # <--- Your test image. - ], - ... +filegroup( + name = "ovic_testdata", + srcs = [ + "@tflite_ovic_testdata//:float_model.lite", + "@tflite_ovic_testdata//:low_res_model.lite", + "@tflite_ovic_testdata//:quantized_model.lite", + "@tflite_ovic_testdata//:test_image_128.jpg", + "@tflite_ovic_testdata//:test_image_224.jpg", + "my_model.lite", # <--- Your submission. + "my_test_image.jpg", # <--- Your test image. + ], + ... ```
* Modify `OvicClassifierTest.java` to test your model.
-Change `TEST_IMAGE_PATH` to `testdata/my_test_image.jpg`. If your model runs inference in floating point, change `FLOAT_MODEL_PATH` to `testdata/my_model.lite`. If your model runs [quantized inference](https://www.tensorflow.org/performance/quantization), change `QUANTIZED_MODEL_PATH` to `testdata/my_model.lite`.
+Change `TEST_IMAGE_PATH` to `my_test_image.jpg`. Change either `FLOAT_MODEL_PATH` or `QUANTIZED_MODEL_PATH` to `my_model.lite` depending on whether your model runs inference in float or [8-bit](https://www.tensorflow.org/performance/quantization) (a rough sketch of this change follows). Now you can run the bazel tests to catch any runtime issues with the submission.
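For illustration, the edit described in the last bullet might look roughly like the snippet below. The field names follow the instructions above, but the exact modifiers and surrounding declarations in `OvicClassifierTest.java` are assumptions; match them to the real test source.

```java
// Sketch only: names follow the README instructions above; the actual
// declarations in OvicClassifierTest.java may differ in modifiers or context.
private static final String TEST_IMAGE_PATH = "my_test_image.jpg";  // your test image
private static final String FLOAT_MODEL_PATH = "my_model.lite";     // float submission
// For an 8-bit quantized submission, point QUANTIZED_MODEL_PATH at it instead:
// private static final String QUANTIZED_MODEL_PATH = "my_model.lite";
```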
+
+Note: Please make sure that your submission passes the test. If a submission fails to pass the test, it will not be processed by the submission server.
+
+## Measure on-device latency
+
+We provide two ways to measure the on-device latency of your submission. The first is through our competition server, which is reliable and repeatable, but is limited to a few trials per day. The second is through the benchmarker APK, which requires a device and may not be as accurate as the server, but has a fast turnaround and no access limitations. We recommend that participants use the benchmarker APK for early development and reserve the competition server for evaluating promising submissions.
+
+### Running the benchmarker app
+
+Make sure that you have followed the instructions in [Test your submissions](#test-your-submissions) to add your model to the testdata folder and to the corresponding build rules.
+
+Modify `tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java`:
+
+* Add your model to the benchmarker APK by changing `MODEL_PATH` and `TEST_IMAGE_PATH` below to your submission and test image.
+
+``` + private static final String TEST_IMAGE_PATH = "my_test_image.jpg"; + private static final String MODEL_PATH = "my_model.lite"; +``` +
+* Adjust the benchmark parameters when needed:
+
+You can change the length of each experiment and the processor affinity below. `BIG_CORE_MASK` is an integer whose binary encoding represents the set of used cores. This number is phone-specific. For example, the Pixel 2 has 8 cores: the 4 little cores are represented by the 4 less significant bits, and the 4 big cores by the 4 more significant bits. Therefore a mask value of 16, or `00010000` in binary, represents using only the first big core. The mask 32, or `00100000` in binary, uses the second big core and should deliver results identical to mask 16 because the big cores are interchangeable (a short arithmetic sketch follows the sample latencies section below).
+
+``` + /** Wall time for each benchmarking experiment. */ + private static final double WALL_TIME = 3000; + /** Maximum number of iterations in each benchmarking experiment. */ + private static final int MAX_ITERATIONS = 100; + /** Mask for binding to a single big core. Pixel 1 (4), Pixel 2 (16). */ + private static final int BIG_CORE_MASK = 16; +``` +
+Note: You'll need root access to the phone to change processor affinity.
+
+* Build and install the app.
+
+``` +bazel build -c opt --cxxopt=--std=c++11 --cxxopt=-Wno-all //tensorflow/contrib/lite/java/ovic/demo/app:ovic_benchmarker_binary +adb install -r bazel-bin/tensorflow/contrib/lite/java/ovic/demo/app/ovic_benchmarker_binary.apk +``` +
+Start the app and click the dark green `Start` button. The button should turn bright green, signaling that the experiment is running. The benchmarking results will be displayed after about the `WALL_TIME` you specified above. For example:
+
+``` +my_model.lite: Average latency=158.6ms after 20 runs. +``` +
+### Sample latencies
+
+Note: The benchmarking results can vary considerably depending on the background processes running on the phone. A few things that help stabilize the app's readings are placing the phone on a cooling plate, restarting the phone, and shutting down internet access.
+
+| Model | Pixel 1 latency (ms) | Pixel 2 latency (ms) | +| -------------------- |:---------------------:| --------------------:| +| float_model.lite | 120 | 155 | +| quantized_model.lite | 85 | 74 | +| low_res_model.lite | 4.2 | 4.0 |
+
+Since the Pixel 2 has excellent support for 8-bit quantized models, we strongly recommend that you check out the [quantization training tutorial](https://www.tensorflow.org/performance/quantization).
+
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml b/tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml new file mode 100644 index 00000000000000..55f2961fd717bd --- /dev/null +++ b/tensorflow/contrib/lite/java/ovic/demo/app/AndroidManifest.xml @@ -0,0 +1,48 @@ + + + + + + + + + + + + + + + + + + + + + +
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/BUILD b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD new file mode 100644 index 00000000000000..83974f4b337bae --- /dev/null +++ b/tensorflow/contrib/lite/java/ovic/demo/app/BUILD @@ -0,0 +1,29 @@ +# Sample app for OVIC benchmarking. +licenses(["notice"]) # Apache 2.0 + +android_binary( + name = "ovic_benchmarker_binary", + srcs = [ + "OvicBenchmarker.java", + "OvicBenchmarkerActivity.java", + ], + assets = [ + "//tensorflow/contrib/lite/java/ovic/src/testdata:ovic_testdata", + "//tensorflow/contrib/lite/java/ovic/src/testdata:labels.txt", + ], + assets_dir = "", + custom_package = "ovic.demo.app", + manifest = "AndroidManifest.xml", + nocompress_extensions = [ + ".lite", + ".tflite", + ], + resource_files = glob(["res/**"]), + tags = ["manual"], + deps = [ + "//tensorflow/contrib/lite/java:tensorflowlite", + "//tensorflow/contrib/lite/java/ovic:ovicbenchmarkerlib", + "@androidsdk//com.android.support:support-v13-25.2.0", + "@androidsdk//com.android.support:support-v4-25.2.0", + ], +)
diff --git a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarker.java similarity index 97% rename from tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java rename to tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarker.java index d0102883e6b41f..113ab74a20dabc 100644 --- a/tensorflow/contrib/lite/java/ovic/src/main/java/org/tensorflow/ovic/OvicBenchmarker.java +++ b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarker.java @@ -1,4 +1,4 @@ -/*Copyright 2018 Google LLC +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -package org.tensorflow.ovic; +package ovic.demo.app; import android.graphics.Bitmap; import android.os.SystemClock; @@ -22,6 +22,8 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.MappedByteBuffer; +import org.tensorflow.ovic.OvicClassifier; +import org.tensorflow.ovic.OvicSingleImageResult; /** * Class that benchmarks image classifier models.
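As a side note on the `BIG_CORE_MASK` values discussed in the README above and used by `OvicBenchmarkerActivity.java` below: the mask is plain bit arithmetic over CPU indices. A minimal sketch follows, assuming the 4-little/4-big core layout of the Pixel 2 example; other phones differ, and the helper name here is purely illustrative.

```java
// Minimal sketch of the affinity-mask arithmetic described in the README above.
// Assumes little cores occupy the low bits and big cores the high bits (Pixel 2
// example); the core layout is phone-specific.
public final class CoreMaskSketch {
  /** Mask selecting one big core on a phone with `littleCores` little cores. */
  static int singleBigCoreMask(int littleCores, int bigCoreIndex) {
    return 1 << (littleCores + bigCoreIndex);
  }

  public static void main(String[] args) {
    System.out.println(singleBigCoreMask(4, 0)); // 16 (0b00010000): first big core
    System.out.println(singleBigCoreMask(4, 1)); // 32 (0b00100000): second big core
  }
}
```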
diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java new file mode 100644 index 00000000000000..59457c308ad7ca --- /dev/null +++ b/tensorflow/contrib/lite/java/ovic/demo/app/OvicBenchmarkerActivity.java @@ -0,0 +1,247 @@ +/* Copyright 2018 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +package ovic.demo.app; + +import android.app.Activity; +import android.content.res.AssetFileDescriptor; +import android.content.res.AssetManager; +import android.graphics.Bitmap; +import android.graphics.BitmapFactory; +import android.os.Bundle; +import android.os.Process; +import android.os.SystemClock; +import android.util.Log; +import android.view.View; +import android.widget.TextView; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.text.DecimalFormat; +import org.tensorflow.ovic.OvicSingleImageResult; + +/** Class that benchmark image classifier models. */ +public class OvicBenchmarkerActivity extends Activity { + /** Tag for the {@link Log}. */ + private static final String TAG = "OvicBenchmarkerActivity"; + + /** Name of the label file stored in Assets. */ + private static final String LABEL_PATH = "labels.txt"; + + private static final String TEST_IMAGE_PATH = "test_image_224.jpg"; + private static final String MODEL_PATH = "float_model.lite"; + /** + * Each bottom press will launch a benchmarking experiment. The experiment stops when either the + * total native latency reaches WALL_TIME or the number of iterations reaches MAX_ITERATIONS, + * whichever comes first. + */ + /** Wall time for each benchmarking experiment. */ + private static final double WALL_TIME = 3000; + /** Maximum number of iterations in each benchmarking experiment. */ + private static final int MAX_ITERATIONS = 100; + /** Mask for binding to a single big core. Pixel 1 (4), Pixel 2 (16). */ + private static final int BIG_CORE_MASK = 16; + /** Amount of time in milliseconds to wait for affinity to set. */ + private static final int WAIT_TIME_FOR_AFFINITY = 1000; + + /* The model to be benchmarked. */ + private MappedByteBuffer model = null; + private InputStream labelInputStream = null; + private OvicBenchmarker benchmarker; + /** Inference result of each iteration. */ + OvicSingleImageResult iterResult = null; + + private TextView textView = null; + // private Button startButton = null; + private static final DecimalFormat df2 = new DecimalFormat(".##"); + + @Override + protected void onCreate(Bundle savedInstanceState) { + super.onCreate(savedInstanceState); + setContentView(R.layout.activity_main); + + // TextView used to display the progress, for information purposes only. 
+ textView = (TextView) findViewById(R.id.textView); + } + + private Bitmap loadTestBitmap() throws IOException { + InputStream imageStream = getAssets().open(TEST_IMAGE_PATH); + return BitmapFactory.decodeStream(imageStream); + } + + public void initializeTest() throws IOException { + Log.i(TAG, "Initializing benchmarker."); + benchmarker = new OvicBenchmarker(WALL_TIME); + AssetManager am = getAssets(); + AssetFileDescriptor fileDescriptor = am.openFd(MODEL_PATH); + FileInputStream modelInputStream = new FileInputStream(fileDescriptor.getFileDescriptor()); + FileChannel fileChannel = modelInputStream.getChannel(); + long startOffset = fileDescriptor.getStartOffset(); + long declaredLength = fileDescriptor.getDeclaredLength(); + model = fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength); + labelInputStream = am.open(LABEL_PATH); + } + + public Boolean doTestIteration() throws IOException, InterruptedException { + if (benchmarker == null) { + throw new RuntimeException("Benchmarker has not been initialized."); + } + if (benchmarker.shouldStop()) { + return false; + } + if (!benchmarker.readyToTest()) { + Log.i(TAG, "getting ready to test."); + benchmarker.getReadyToTest(labelInputStream, model); + if (!benchmarker.readyToTest()) { + throw new RuntimeException("Failed to get the benchmarker ready."); + } + } + Log.i(TAG, "Going to do test iter."); + // Start testing. + Bitmap testImageBitmap = loadTestBitmap(); + iterResult = benchmarker.doTestIteration(testImageBitmap); + testImageBitmap.recycle(); + if (iterResult == null) { + throw new RuntimeException("Inference failed to produce a result."); + } + Log.i(TAG, iterResult.toString()); + return true; + } + + public void startPressed(View view) throws IOException { + Log.i(TAG, "Start pressed"); + try { + initializeTest(); + } catch (IOException e) { + Log.e(TAG, "Can't initialize benchmarker.", e); + throw e; + } + String displayText = ""; + try { + setProcessorAffinity(BIG_CORE_MASK); + } catch (IOException e) { + Log.e(TAG, e.getMessage()); + displayText = e.getMessage() + "\n"; + } + Log.i(TAG, "Successfully initialized benchmarker."); + int testIter = 0; + Boolean iterSuccess = false; + double totalLatency = 0.0f; + while (testIter < MAX_ITERATIONS) { + try { + iterSuccess = doTestIteration(); + } catch (IOException e) { + Log.e(TAG, "Error during iteration " + testIter); + throw e; + } catch (InterruptedException e) { + Log.e(TAG, "Interrupted at iteration " + testIter); + } + if (!iterSuccess) { + break; + } + testIter++; + totalLatency += (double) iterResult.latency; + } + ; + Log.i(TAG, "Benchmarking finished"); + + if (textView != null) { + if (testIter > 0) { + textView.setText( + displayText + + MODEL_PATH + + ": Average latency=" + + df2.format(totalLatency / testIter) + + "ms after " + + testIter + + " runs."); + } else { + textView.setText("Benchmarker failed to run on more than one images."); + } + } + } + + private static void setProcessorAffinity(int mask) throws IOException { + int myPid = Process.myPid(); + Log.i(TAG, String.format("Setting processor affinity to 0x%02x", mask)); + + String command = String.format("taskset -a -p %x %d", mask, myPid); + try { + Runtime.getRuntime().exec(command).waitFor(); + } catch (InterruptedException e) { + throw new IOException("Interrupted: " + e); + } + + // Make sure set took effect - try for a second to confirm the change took. If not then fail. 
+ long startTimeMs = SystemClock.elapsedRealtime(); + while (true) { + int readBackMask = readCpusAllowedMask(); + if (readBackMask == mask) { + Log.i(TAG, String.format("Successfully set affinity to 0x%02x", mask)); + return; + } + if (SystemClock.elapsedRealtime() > startTimeMs + WAIT_TIME_FOR_AFFINITY) { + throw new IOException( + String.format( + "Core-binding failed: affinity set to 0x%02x but read back as 0x%02x\n" + + "please root device.", + mask, readBackMask)); + } + + try { + Thread.sleep(50); + } catch (InterruptedException e) { + // Ignore sleep interrupted, will sleep again and compare is final cross-check. + } + } + } + + public static int readCpusAllowedMask() throws IOException { + // Determine how many CPUs there are total + final String pathname = "/proc/self/status"; + final String resultPrefix = "Cpus_allowed:"; + File file = new File(pathname); + String line = ""; + String allowedCPU = ""; + Integer allowedMask = null; + BufferedReader bufReader = null; + try { + bufReader = new BufferedReader(new FileReader(file)); + while ((line = bufReader.readLine()) != null) { + if (line.startsWith(resultPrefix)) { + allowedMask = Integer.valueOf(line.substring(resultPrefix.length()).trim(), 16); + allowedCPU = bufReader.readLine(); + break; + } + } + } catch (RuntimeException e) { + throw new IOException( + "Invalid number in " + pathname + " line: \"" + line + "\": " + e.getMessage()); + } finally { + if (bufReader != null) { + bufReader.close(); + } + } + if (allowedMask == null) { + throw new IOException(pathname + " missing " + resultPrefix + " line"); + } + Log.i(TAG, allowedCPU); + return allowedMask; + } +} diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/build.gradle b/tensorflow/contrib/lite/java/ovic/demo/app/build.gradle new file mode 100644 index 00000000000000..c5d19bad89a939 --- /dev/null +++ b/tensorflow/contrib/lite/java/ovic/demo/app/build.gradle @@ -0,0 +1,58 @@ +apply plugin: 'com.android.application' + +android { + compileSdkVersion 26 + buildToolsVersion "26.0.1" + defaultConfig { + applicationId "android.example.com.ovicbenchmarker" + minSdkVersion 15 + targetSdkVersion 26 + versionCode 1 + versionName "1.0" + testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner" + + // Remove this block. 
+ jackOptions { + enabled true + } + } + lintOptions { + abortOnError false + } + buildTypes { + release { + minifyEnabled false + proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro' + } + } + aaptOptions { + noCompress "lite", "tflite" + } + + compileOptions { + sourceCompatibility JavaVersion.VERSION_1_8 + targetCompatibility JavaVersion.VERSION_1_8 + } +} + +repositories { + maven { + url 'https://google.bintray.com/tensorflow' + } +} + +dependencies { + compile fileTree(dir: 'libs', include: ['*.jar']) + androidTestCompile('com.android.support.test.espresso:espresso-core:2.2.2', { + exclude group: 'com.android.support', module: 'support-annotations' + }) + compile 'com.android.support:appcompat-v7:25.2.0' + compile 'com.android.support.constraint:constraint-layout:1.0.2' + compile 'com.android.support:design:25.2.0' + compile 'com.android.support:support-annotations:25.3.1' + compile 'com.android.support:support-v13:25.2.0' + + compile 'org.tensorflow:tensorflow-lite:+' + + testCompile 'junit:junit:4.12' +} diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png new file mode 100644 index 00000000000000..715d1b6d69c0f4 Binary files /dev/null and b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-mdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png new file mode 100644 index 00000000000000..9beff0885fd4c8 Binary files /dev/null and b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable-xhdpi/ic_launcher.png differ diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml new file mode 100644 index 00000000000000..93f5c6a016b499 --- /dev/null +++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/drawable/start_button_color.xml @@ -0,0 +1,39 @@ + + + + + + + + + + + + + + + + + + + diff --git a/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml b/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml new file mode 100644 index 00000000000000..e9d83bae543ae6 --- /dev/null +++ b/tensorflow/contrib/lite/java/ovic/demo/app/res/layout/activity_main.xml @@ -0,0 +1,54 @@ + + + + + + +